
Commit bed7be1

[GPU] Skip subgroup-level tiling for tensor.pad fusion in coalesced DMA
The subgroup-level tiling was creating an outer loop (1, 4, 64) that distributed the padded buffer across multiple iterations, causing each iteration to create 1×64 dest subviews. The lowering pass would then use the dest shape (1×64) for delinearization, causing all iterations to load from source row 0 instead of rows 0-3.

This fix skips subgroup-level tiling for tensor.pad fusion cases by:

1. Detecting tensor.pad in applySubgroupTiling() before calling tileAtSubgroupLevel().
2. Adding a new ConvertPadFusionCopyToCoalescedDMA pattern that converts these operations directly, without requiring a warp-mapped forall parent.

This allows coalesced_gather_dma to operate on full 4×64 buffers with a single lane-mapped forall, letting the lowering pass correctly generate 4 transfers per lane to cover all source rows.

Fixes unaligned matmul tests (65x64x121, 133x97x65).
1 parent 229911e commit bed7be1
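
To make the delinearization failure concrete, here is a minimal standalone C++ toy model, not taken from this commit: it assumes a row-major flat-offset-to-(row, col) mapping with wrap-around, and the constants (64 lanes, 4 transfers per lane) are illustrative; the real lowering pass has its own logic. With a 1×64 dest shape every offset resolves to row 0, while the full 4×64 buffer spreads the offsets across rows 0-3, matching the behavior described above.

// Toy model, not from this commit: 64 lanes each issuing 4 transfers gives
// 256 flat element offsets, delinearized row-major against the dest shape.
// The wrap-around (% rows) and the constants are illustrative assumptions.
#include <cstdio>

int main() {
  const int kFlatOffsets = 64 * 4;              // 64 lanes x 4 transfers per lane
  const int shapes[2][2] = {{1, 64}, {4, 64}};  // dest shapes: {rows, cols}
  for (const auto &shape : shapes) {
    int rows = shape[0], cols = shape[1];
    int minRow = rows, maxRow = -1;
    for (int flat = 0; flat < kFlatOffsets; ++flat) {
      int row = (flat / cols) % rows;  // source row this offset resolves to
      minRow = row < minRow ? row : minRow;
      maxRow = row > maxRow ? row : maxRow;
    }
    std::printf("%dx%d dest shape: offsets resolve to rows %d..%d\n", rows,
                cols, minRow, maxRow);
  }
}

Running this prints rows 0..0 for the 1×64 shape and rows 0..3 for the 4×64 shape, which is the difference between the broken and fixed lowerings described in the message.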

File tree

1 file changed: +88 -0 lines changed


compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp

Lines changed: 88 additions & 0 deletions
@@ -475,6 +475,63 @@ struct ConvertCopyToCoalescedDMA
   }
 };
 
+/// Pattern to convert tensor.pad fusion cases directly without requiring
+/// warp-mapped forall parent.
+struct ConvertPadFusionCopyToCoalescedDMA
+    : public OpRewritePattern<linalg::CopyOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(linalg::CopyOp copyOp,
+                                PatternRewriter &rewriter) const override {
+    // Only match copies with use_global_load_dma config.
+    auto config = getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(copyOp);
+    if (!config) {
+      return failure();
+    }
+
+    // Check if this is a tensor.pad fusion case.
+    Value source = copyOp.getInputs()[0];
+    // Trace through extract_slice to find tensor.pad.
+    while (auto extractSlice = source.getDefiningOp<tensor::ExtractSliceOp>()) {
+      source = extractSlice.getSource();
+    }
+    auto pad = source.getDefiningOp<tensor::PadOp>();
+    if (!pad) {
+      return failure(); // Not a pad fusion case.
+    }
+
+    // Check if padding exists (non-zero low/high pad).
+    bool hasPadding = false;
+    for (auto [low, high] :
+         llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
+      if (!isConstantIntValue(low, 0) || !isConstantIntValue(high, 0)) {
+        hasPadding = true;
+        break;
+      }
+    }
+    if (!hasPadding) {
+      return failure(); // No actual padding.
+    }
+
+    // This is a tensor.pad fusion case. Convert directly to
+    // coalesced_gather_dma without requiring warp-mapped forall.
+    auto outputType = cast<RankedTensorType>(copyOp.getOutputs()[0].getType());
+    SmallVector<OpFoldResult> threadNumThreads =
+        computeThreadNumThreadsImpl(rewriter, copyOp, outputType);
+    if (threadNumThreads.empty()) {
+      return failure();
+    }
+
+    scf::ForallOp threadForallOp =
+        tileToThreadLevel(copyOp, rewriter, threadNumThreads);
+    if (!threadForallOp) {
+      return failure();
+    }
+
+    return createDMAInForall<linalg::CopyOp>(threadForallOp, rewriter);
+  }
+};
+
 struct ConvertGatherToCoalescedDMA
     : public OpRewritePattern<IREE::LinalgExt::GatherOp> {
   using OpRewritePattern<IREE::LinalgExt::GatherOp>::OpRewritePattern;
@@ -660,9 +717,11 @@ struct GPUConvertToCoalescedDMAPass final
     }
 
     // Only tile and convert ops within forall ops with warp mapping.
+    // Also handle tensor.pad fusion cases that don't have warp mapping.
    RewritePatternSet patterns(context);
    patterns.add<ConvertGatherToCoalescedDMA>(context);
    patterns.add<ConvertCopyToCoalescedDMA>(context);
+    patterns.add<ConvertPadFusionCopyToCoalescedDMA>(context);
 
    walkAndApplyPatterns(funcOp, std::move(patterns));
   }
@@ -854,6 +913,35 @@ struct GPUConvertToCoalescedDMAPass final
     // Apply subgroup-level tiling to each op.
     IRRewriter rewriter(context);
     for (Operation *op : opsToTile) {
+      // Check if this is a tensor.pad fusion case for CopyOp.
+      // If so, skip subgroup-level tiling to avoid creating the outer loop.
+      bool skipSubgroupTiling = false;
+      if (auto copyOp = dyn_cast<linalg::CopyOp>(op)) {
+        Value source = copyOp.getInputs()[0];
+        // Trace through extract_slice to find tensor.pad.
+        while (auto extractSlice =
+                   source.getDefiningOp<tensor::ExtractSliceOp>()) {
+          source = extractSlice.getSource();
+        }
+        if (auto pad = source.getDefiningOp<tensor::PadOp>()) {
+          // Check if padding exists (non-zero low/high pad).
+          for (auto [low, high] :
+               llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
+            if (!isConstantIntValue(low, 0) || !isConstantIntValue(high, 0)) {
+              skipSubgroupTiling = true;
+              break;
+            }
+          }
+        }
+      }
+
+      if (skipSubgroupTiling) {
+        // Skip subgroup-level tiling for tensor.pad fusion.
+        // The operation will be handled at thread-level tiling with full
+        // buffers.
+        continue;
+      }
+
       FailureOr<scf::SCFTilingResult> tilingResult =
           TypeSwitch<Operation *, FailureOr<scf::SCFTilingResult>>(op)
               .Case([&](linalg::CopyOp copyOp) {
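
One observation on the change: the tensor.pad detection (walking through extract_slice ops, then checking for non-zero low/high padding) is now duplicated between ConvertPadFusionCopyToCoalescedDMA and the subgroup-tiling skip check. A possible follow-up, sketched below as a suggestion rather than as part of this commit, would factor it into a shared helper in the same file; the name isPadFusionCopy is made up here, but every call it uses already appears in the diff above.

// Hypothetical helper (not in this commit): returns true if `copyOp` reads,
// possibly through tensor.extract_slice ops, from a tensor.pad with non-zero
// low or high padding. Mirrors the checks duplicated in the two hunks above.
static bool isPadFusionCopy(linalg::CopyOp copyOp) {
  Value source = copyOp.getInputs()[0];
  // Trace through extract_slice ops to find a producing tensor.pad.
  while (auto extractSlice = source.getDefiningOp<tensor::ExtractSliceOp>()) {
    source = extractSlice.getSource();
  }
  auto pad = source.getDefiningOp<tensor::PadOp>();
  if (!pad) {
    return false;
  }
  // Only treat it as a pad fusion case if there is actual padding.
  for (auto [low, high] :
       llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
    if (!isConstantIntValue(low, 0) || !isConstantIntValue(high, 0)) {
      return true;
    }
  }
  return false;
}

Both the pattern's match and the skip check in applySubgroupTiling() could then call this single predicate.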
