
Commit faf553a

lialan authored and Your Name committed
[GPU] Add in_bounds attribute to CoalescedGatherDMAOp for tensor.pad fusion
Add support for fusing tensor.pad into coalesced_gather_dma when the copy source is a padded tensor. This enables DMA operations to read directly from global memory (fat_raw_buffer) instead of creating private memory allocations for padded data.

Key changes:
* Add optional in_bounds attribute to CoalescedGatherDMAOp (per-dim bool array)
* Update verifier to allow source/init shape mismatches when in_bounds[dim]=false
* Modify GPUConvertToCoalescedDMA to trace through tensor.pad and extract_slice
* Compute in_bounds based on padding: true if no padding, false if OOB allowed

Constraints:
* Low padding must be [0, 0] (no low padding)
* Padding value must be constant 0.0 (matches AMD hardware OOB behavior)

AMD fat_raw_buffer with boundsCheck=true returns 0 for out-of-bounds reads, providing hardware-level padding semantics without explicit software masking.
1 parent b1d73d0 commit faf553a
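To make the in_bounds rule concrete, here is a minimal standalone sketch of the intended semantics, using plain integers and a double in place of the MLIR OpFoldResult/attribute machinery. The helper name and signature are illustrative only, not part of this change.

#include <cstdint>
#include <optional>
#include <vector>

// Illustrative helper (not from this commit): per-dimension in_bounds is true
// only when that dimension has no padding, and the fusion is only legal when
// low padding is zero everywhere and the padding value is a constant 0, so
// hardware OOB zero-fill reproduces the tensor.pad semantics.
static std::optional<std::vector<bool>>
computeInBoundsFlags(const std::vector<int64_t> &lowPad,
                     const std::vector<int64_t> &highPad, double padValue) {
  if (padValue != 0.0)
    return std::nullopt; // Pad value must be constant 0.
  std::vector<bool> inBounds;
  for (size_t dim = 0; dim < lowPad.size(); ++dim) {
    if (lowPad[dim] != 0)
      return std::nullopt; // Low padding is not supported.
    inBounds.push_back(highPad[dim] == 0); // false => OOB reads may occur.
  }
  return inBounds;
}

For example, lowPad = {0, 0} with highPad = {0, 3} would yield in_bounds = [true, false], so only the innermost dimension relies on the hardware zero-fill.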


11 files changed: +689 −39 lines changed


compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp

Lines changed: 179 additions & 11 deletions
@@ -23,6 +23,7 @@
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -58,6 +59,15 @@ static SmallVector<Attribute> getThreadMapping(MLIRContext *ctx) {
   return mapping;
 }
 
+/// Trace through extract_slice operations to find an underlying tensor.pad.
+/// Returns the PadOp if found, nullptr otherwise.
+static tensor::PadOp traceToTensorPad(Value source) {
+  while (auto extractSlice = source.getDefiningOp<tensor::ExtractSliceOp>()) {
+    source = extractSlice.getSource();
+  }
+  return source.getDefiningOp<tensor::PadOp>();
+}
+
 /// Check if a value traces back to tensor.empty (possibly through forall args).
 static bool tracesToTensorEmpty(Value value) {
   // Direct tensor.empty.
@@ -300,14 +310,74 @@ static LogicalResult createDMAInForall(scf::ForallOp threadForallOp,
 
   Location loc = innerOp.getLoc();
   Value source, indices;
+  SmallVector<bool> inBoundsVec;
 
   // Extract source and indices based on op type.
   if constexpr (std::is_same_v<OpTy, linalg::CopyOp>) {
     Value input = innerOp.getInputs()[0];
-    if (auto extractSlice = input.getDefiningOp<tensor::ExtractSliceOp>()) {
-      source = extractSlice.getSource();
-    } else {
-      return failure();
+
+    // After tiling, the input is typically:
+    //   tensor.extract_slice %padded[...] [...] [1, 1]
+    // We need to trace through extract_slice to find if source is tensor.pad.
+    if (auto pad = traceToTensorPad(input)) {
+      // Verify pad constraints: low padding must be all zeros, pad value must
+      // be 0.
+      bool validPad = true;
+      for (OpFoldResult low : pad.getMixedLowPad()) {
+        if (!isConstantIntValue(low, 0)) {
+          validPad = false;
+          break;
+        }
+      }
+      Value padVal = pad.getConstantPaddingValue();
+      if (!padVal || !(matchPattern(padVal, m_AnyZeroFloat()) ||
+                       matchPattern(padVal, m_Zero()))) {
+        validPad = false;
+      }
+
+      if (validPad) {
+        // Use pad.getSource() directly as the DMA source.
+        // This is the tensor.extract_slice result (e.g., tensor<?x64xf32>).
+        source = pad.getSource();
+
+        // Check if source tensor's innermost row size is DWORD (4-byte)
+        // aligned. On AMD CDNA, per-component range checking is performed for
+        // each DWORD. If a DWORD is partially out-of-bounds, the entire DWORD
+        // returns zero, causing incorrect results. Additionally, partial OOB
+        // triggers the slow path with multi-cycling and instruction issue
+        // penalties.
+        auto sourceType = cast<RankedTensorType>(source.getType());
+        int64_t innermostDim = sourceType.getShape().back();
+        if (!ShapedType::isDynamic(innermostDim)) {
+          Type elemType = sourceType.getElementType();
+          int64_t elemBytes = elemType.getIntOrFloatBitWidth() / 8;
+          int64_t rowBytes = innermostDim * elemBytes;
+          if (rowBytes % 4 != 0) {
+            LLVM_DEBUG(llvm::dbgs()
+                       << "Skipping DMA: row size " << rowBytes
+                       << " bytes not DWORD-aligned (slow path)\n");
+            return failure();
+          }
+        }
+
+        // Compute in_bounds based on whether padding was added per dimension.
+        for (auto [low, high] :
+             llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
+          bool isInBounds =
+              isConstantIntValue(low, 0) && isConstantIntValue(high, 0);
+          inBoundsVec.push_back(isInBounds);
+        }
+      }
+    }
+
+    // Fallback: original behavior without tensor.pad fusion.
+    // Only trace through ONE level of extract_slice (the immediate input).
+    if (!source) {
+      if (auto extractSlice = input.getDefiningOp<tensor::ExtractSliceOp>()) {
+        source = extractSlice.getSource();
+      } else {
+        return failure();
+      }
     }
   } else if constexpr (std::is_same_v<OpTy, IREE::LinalgExt::GatherOp>) {
     source = innerOp.getSource();
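As a side note on the DWORD constraint in the hunk above, the same guard can be restated in isolation. This is a standalone sketch with assumed names, not the compiler code:

#include <cstdint>

// Restates the alignment guard above. With f16 (2-byte) elements, a
// 31-element innermost row is 62 bytes: the last valid element can share a
// DWORD with out-of-bounds bytes, so hardware bounds checking could zero the
// whole DWORD and clobber a valid value (and hit the slow path). Rows that
// are a whole number of DWORDs avoid this; anything else makes the pattern
// bail out to the non-DMA path.
static bool rowIsDwordAligned(int64_t innermostDimSize, int64_t elemBitWidth) {
  const int64_t rowBytes = innermostDimSize * (elemBitWidth / 8);
  return rowBytes % 4 == 0;
}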
@@ -356,15 +426,22 @@ static LogicalResult createDMAInForall(scf::ForallOp threadForallOp,
 
   // Create the DMA op in the in_parallel region.
   rewriter.setInsertionPointToStart(&inParallelBlock);
-  SmallVector<Value, 1> indicesVec;
+  SmallVector<Value, 1> indicesOperands;
   if (indices) {
-    indicesVec.push_back(indices);
+    indicesOperands.push_back(indices);
+  }
+
+  // Create in_bounds attribute if we fused a tensor.pad.
+  ArrayAttr inBoundsAttr;
+  if (!inBoundsVec.empty()) {
+    inBoundsAttr = rewriter.getBoolArrayAttr(inBoundsVec);
   }
 
   // When used in forall.in_parallel, the op doesn't return a result
   // as it performs an in-place update to the shared_outs tensor.
   IREE::GPU::CoalescedGatherDMAOp::create(rewriter, loc, Type(), source,
-                                          indicesVec, sharedOut, laneId);
+                                          indicesOperands, sharedOut, laneId,
+                                          inBoundsAttr);
 
   // Erase the parallel_insert_slice ops and inner operation.
   for (tensor::ParallelInsertSliceOp &insertOp : toErase) {
421498
}
422499
};
423500

501+
/// Pattern to convert tensor.pad fusion cases directly without requiring
502+
/// warp-mapped forall parent.
503+
struct ConvertPadFusionCopyToCoalescedDMA
504+
: public OpRewritePattern<linalg::CopyOp> {
505+
using OpRewritePattern::OpRewritePattern;
506+
507+
LogicalResult matchAndRewrite(linalg::CopyOp copyOp,
508+
PatternRewriter &rewriter) const override {
509+
// Only match copies with use_global_load_dma config
510+
auto config = getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(copyOp);
511+
if (!config) {
512+
return failure();
513+
}
514+
515+
// Check if this is a tensor.pad fusion case
516+
auto pad = traceToTensorPad(copyOp.getInputs()[0]);
517+
if (!pad) {
518+
return failure(); // Not a pad fusion case
519+
}
520+
521+
// Check if padding exists (non-zero low/high pad)
522+
bool hasPadding = false;
523+
for (auto [low, high] :
524+
llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
525+
if (!isConstantIntValue(low, 0) || !isConstantIntValue(high, 0)) {
526+
hasPadding = true;
527+
break;
528+
}
529+
}
530+
if (!hasPadding) {
531+
return failure(); // No actual padding
532+
}
533+
534+
// This is a tensor.pad fusion case. Convert directly to
535+
// coalesced_gather_dma without requiring warp-mapped forall.
536+
auto outputType = cast<RankedTensorType>(copyOp.getOutputs()[0].getType());
537+
SmallVector<OpFoldResult> threadNumThreads =
538+
computeThreadNumThreadsImpl(rewriter, copyOp, outputType);
539+
if (threadNumThreads.empty()) {
540+
return failure();
541+
}
542+
543+
scf::ForallOp threadForallOp =
544+
tileToThreadLevel(copyOp, rewriter, threadNumThreads);
545+
if (!threadForallOp) {
546+
return failure();
547+
}
548+
549+
return createDMAInForall<linalg::CopyOp>(threadForallOp, rewriter);
550+
}
551+
};
552+
424553
struct ConvertGatherToCoalescedDMA
425554
: public OpRewritePattern<IREE::LinalgExt::GatherOp> {
426555
using OpRewritePattern<IREE::LinalgExt::GatherOp>::OpRewritePattern;
@@ -574,7 +703,8 @@ struct ConvertGatherToCoalescedDMA
     rewriter.setInsertionPointToStart(&inParallelBlock);
 
     IREE::GPU::CoalescedGatherDMAOp::create(rewriter, loc, Type(), source,
-                                            indicesVec, sharedOut, laneId);
+                                            indicesVec, sharedOut, laneId,
+                                            /*in_bounds=*/nullptr);
 
     // Erase parallel_insert_slice ops and gather op.
     SmallVector<tensor::ParallelInsertSliceOp> toErase;
@@ -605,9 +735,11 @@ struct GPUConvertToCoalescedDMAPass final
    }
 
    // Only tile and convert ops within forall ops with warp mapping.
+   // Also handle tensor.pad fusion cases that don't have warp mapping.
    RewritePatternSet patterns(context);
    patterns.add<ConvertGatherToCoalescedDMA>(context);
    patterns.add<ConvertCopyToCoalescedDMA>(context);
+   patterns.add<ConvertPadFusionCopyToCoalescedDMA>(context);
 
    walkAndApplyPatterns(funcOp, std::move(patterns));
   }
@@ -758,9 +890,42 @@ struct GPUConvertToCoalescedDMAPass final
      return failure();
    }
 
-   // Compute tile sizes for subgroup-level distribution.
-   auto [tileSizes, numTiledDims] =
-       computeSubgroupTileSizes(rewriter, shape, numWarps);
+   // Check if this is a tensor.pad fusion case.
+   bool isPadFusion = false;
+   if (auto copyOp = dyn_cast<linalg::CopyOp>(op.getOperation())) {
+     if (auto pad = traceToTensorPad(copyOp.getInputs()[0])) {
+       // Check if padding exists (non-zero low/high pad).
+       for (auto [low, high] :
+            llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
+         if (!isConstantIntValue(low, 0) || !isConstantIntValue(high, 0)) {
+           isPadFusion = true;
+           break;
+         }
+       }
+     }
+   }
+
+   SmallVector<OpFoldResult> tileSizes;
+   int64_t numTiledDims = 0;
+
+   if (isPadFusion) {
+     // For tensor.pad fusion, create a single-iteration wrapper forall
+     // by setting tile sizes to the full shape. This allows the DMA to
+     // operate on the full buffer while satisfying the warp-mapped parent
+     // requirement.
+     // Bail out if any dimension is dynamic since we need static tile sizes.
+     if (llvm::any_of(shape, ShapedType::isDynamic)) {
+       return failure();
+     }
+     for (int64_t i = 0; i < rank; ++i) {
+       tileSizes.push_back(rewriter.getIndexAttr(shape[i]));
+       ++numTiledDims;
+     }
+   } else {
+     // Compute tile sizes for subgroup-level distribution.
+     std::tie(tileSizes, numTiledDims) =
+         computeSubgroupTileSizes(rewriter, shape, numWarps);
+   }
 
    if (numTiledDims == 0) {
      return failure();
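The full-shape tile sizes above are what make the wrapper forall a single iteration; a small standalone sketch of that arithmetic follows (assumed names, independent of the tiling interface):

#include <cstdint>
#include <vector>

// With tile sizes equal to the full static shape, ceilDiv(shape[d], tile[d])
// is 1 for every dimension, so the warp-mapped wrapper forall runs exactly
// once and the DMA inside it still covers the whole buffer.
static int64_t countForallIterations(const std::vector<int64_t> &shape,
                                     const std::vector<int64_t> &tileSizes) {
  int64_t iterations = 1;
  for (size_t d = 0; d < shape.size(); ++d)
    iterations *= (shape[d] + tileSizes[d] - 1) / tileSizes[d]; // ceilDiv
  return iterations;
}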
@@ -798,6 +963,9 @@ struct GPUConvertToCoalescedDMAPass final
    });
 
    // Apply subgroup-level tiling to each op.
+   // For tensor.pad fusion cases, tileAtSubgroupLevel creates a
+   // single-iteration wrapper forall to maintain the expected structure while
+   // allowing the DMA to operate on the full buffer.
    IRRewriter rewriter(context);
    for (Operation *op : opsToTile) {
      FailureOr<scf::SCFTilingResult> tilingResult =

compiler/src/iree/compiler/Codegen/Common/GPU/GPUInferMemorySpace.cpp

Lines changed: 2 additions & 1 deletion
@@ -47,7 +47,8 @@ bool isDefinitelyShared(bufferization::AllocTensorOp alloc) {
     auto forallOp = dyn_cast<scf::ForallOp>(user);
     if (!forallOp ||
         !forallOpHasMappingType<gpu::GPUThreadMappingAttr,
-                                gpu::GPUWarpMappingAttr>(forallOp)) {
+                                gpu::GPUWarpMappingAttr, IREE::GPU::LaneIdAttr>(
+            forallOp)) {
       return false;
     }
   }
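For readers unfamiliar with the helper being extended here, a mapping-type check over an scf.forall can be sketched roughly as below. This is an illustrative approximation, not IREE's forallOpHasMappingType; the include paths and getter behavior are assumptions.

#include <optional>
#include "llvm/ADT/STLExtras.h"
#include "mlir/Dialect/SCF/IR/SCF.h"

// Rough approximation of a forallOpHasMappingType-style check: returns true
// if any entry of the forall's mapping attribute is one of the requested
// attribute kinds (e.g. thread, warp, or, after this change, lane id).
template <typename... MappingTypes>
static bool hasAnyMappingType(mlir::scf::ForallOp forallOp) {
  std::optional<mlir::ArrayAttr> mapping = forallOp.getMapping();
  if (!mapping)
    return false;
  return llvm::any_of(*mapping, [](mlir::Attribute attr) {
    return llvm::isa<MappingTypes...>(attr);
  });
}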
