-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-convert-to-coalesced-dma))" %s --split-input-file | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-convert-to-coalesced-dma,canonicalize))" %s --split-input-file | FileCheck %s

 #gpu_target_copy = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
@@ -24,8 +24,8 @@ func.func @copy(%source: tensor<64x512xf32>, %init: tensor<64x512xf32>) -> tenso
 // With 16 warps (128*512/64/64) and 64 rows: step = ceil(64/16) = 4 rows, 512 cols (whole)
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (64, 512) step (4, 512)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<64x512xf32>) {
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], %[[IV1]]] [4, 512] [1, 1]
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 512] [1, 1]
+// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [4, 512] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [4, 512] [1, 1]

 // Thread-level forall:
 // CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
@@ -37,7 +37,7 @@ func.func @copy(%source: tensor<64x512xf32>, %init: tensor<64x512xf32>) -> tenso
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}

 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 512] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [4, 512] [1, 1]
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}

@@ -75,22 +75,20 @@ func.func @gather(%source: tensor<64x512xf32>, %indices: tensor<64xi32>, %init:
 // With 64 warps and 64 rows: step = ceil(64/64) = 1 row, 512 cols (whole)
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (64, 512) step (1, 512)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<64x512xf32>) {
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [1, 512] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [1, 512] [1, 1]
 // CHECK-DAG: %[[SLICE_INDICES:.+]] = tensor.extract_slice %[[INDICES]][%[[IV0]]] [1] [1]
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][0, %[[IV1]]] [64, 512] [1, 1]

 // Thread-level forall:
 // CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
 // CHECK-SAME: shared_outs(%[[THREAD_INIT:.+]] = %[[SLICE_DST]]) -> (tensor<1x512xf32>) {
-// CHECK-DAG: %[[SLICE_INDICES_INNER:.+]] = tensor.extract_slice %[[SLICE_INDICES]][0] [1] [1]
 // CHECK: scf.forall.in_parallel {
-// CHECK: iree_gpu.coalesced_gather_dma %[[SLICE_SRC]][%[[SLICE_INDICES_INNER]]] into %[[THREAD_INIT]] lane(%[[LANE]])
+// CHECK: iree_gpu.coalesced_gather_dma %[[SRC]][%[[SLICE_INDICES]]] into %[[THREAD_INIT]] lane(%[[LANE]])
 // CHECK-SAME: : tensor<64x512xf32>, tensor<1xi32>, tensor<1x512xf32>, index
 // CHECK: }
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}

 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [1, 512] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [1, 512] [1, 1]
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}

@@ -170,9 +168,9 @@ func.func @copy_prefer_contiguous_subview(%source: tensor<64x128xf32>, %init: te
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<64x128xf32>) {

 // Key check: subviews are 16x128 (contiguous) not 64x64 (non-contiguous)
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], %[[IV1]]] [16, 128] [1, 1]
+// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [16, 128] [1, 1]
 // CHECK-SAME: : tensor<64x128xf32> to tensor<16x128xf32>
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [16, 128] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [16, 128] [1, 1]
 // CHECK-SAME: : tensor<64x128xf32> to tensor<16x128xf32>

 // Thread-level forall distributes across lanes:
@@ -185,7 +183,7 @@ func.func @copy_prefer_contiguous_subview(%source: tensor<64x128xf32>, %init: te
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}

 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [16, 128] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [16, 128] [1, 1]
 // CHECK-SAME: : tensor<16x128xf32> into tensor<64x128xf32>
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
@@ -231,9 +229,9 @@ func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tenso
 // Warp-level forall: step (32, 16) distributes 128 rows across 4 warps
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (128, 16) step (32, 16)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[EMPTY]]) -> (tensor<128x16xf32>) {
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], %[[IV1]]] [32, 16] [1, 1]
+// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [32, 16] [1, 1]
 // CHECK-SAME: : tensor<128x16xf32> to tensor<32x16xf32>
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [32, 16] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [32, 16] [1, 1]
 // CHECK-SAME: : tensor<128x16xf32> to tensor<32x16xf32>

 // Thread-level forall with 64 lanes
@@ -246,7 +244,7 @@ func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tenso
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}

 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [32, 16] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [32, 16] [1, 1]
 // CHECK-SAME: : tensor<32x16xf32> into tensor<128x16xf32>
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
@@ -451,11 +449,13 @@ func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tenso

 // Key check: tensor.pad is fused - source is the extract_slice result, not the padded tensor
 // in_bounds = [false, true] because M dim has dynamic padding, K dim has no padding
-// CHECK: scf.forall (%[[LANE:.+]]) in (64)
-// CHECK: scf.forall.in_parallel {
-// CHECK: iree_gpu.coalesced_gather_dma %[[EXTRACTED:.*]] into %{{.+}} lane(%[[LANE]]) in_bounds [false, true]
+// CHECK: %[[EXTRACTED:.+]] = tensor.extract_slice %[[SRC]]
+// CHECK: scf.forall {{.*}} shared_outs(%[[OUTER_INIT:.+]] = %[[INIT]])
+// CHECK: scf.forall (%[[LANE:.+]]) in (64) shared_outs(%[[INNER_INIT:.+]] = %[[OUTER_INIT]])
+// CHECK: scf.forall.in_parallel {
+// CHECK: iree_gpu.coalesced_gather_dma %[[EXTRACTED]] into %[[INNER_INIT]] lane(%[[LANE]]) in_bounds [false, true]
 // CHECK-SAME: : tensor<?x64xf32>, tensor<4x64xf32>, index
-// CHECK: }
+// CHECK: }
 // CHECK-NOT: tensor.pad

   return %result : tensor<4x64xf32>
@@ -504,24 +504,22 @@ func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %
 // forall with step (1, 64) producing 4 iterations with 1x64 subviews.
 // For tensor.pad fusion, we instead create a single-iteration wrapper forall
 // with step (4, 64) - the full shape - so the DMA operates on 4x64 directly.
+// After canonicalization, identity extract_slices are eliminated.
 //
+// CHECK: %[[EXTRACTED:.+]] = tensor.extract_slice %[[SRC]]
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (4, 64) step (4, 64)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<4x64xf32>) {
 //
-// The subviews should be the full 4x64 shape:
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice {{.+}}[%[[IV0]], %[[IV1]]] [4, 64] [1, 1]
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 64] [1, 1]
-//
-// Thread-level forall with 64 lanes:
-// CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
+// Thread-level forall with 64 lanes (uses outer forall's shared_out directly):
+// CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64) shared_outs(%[[INNER_INIT:.+]] = %[[INIT_TILE]])
 // CHECK: scf.forall.in_parallel {
-// CHECK: iree_gpu.coalesced_gather_dma {{.+}} into %{{.+}} lane(%[[LANE]]) in_bounds [false, true]
+// CHECK: iree_gpu.coalesced_gather_dma %[[EXTRACTED]] into %[[INNER_INIT]] lane(%[[LANE]]) in_bounds [false, true]
 // CHECK-SAME: : tensor<?x64xf32>, tensor<4x64xf32>, index
 // CHECK: }
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}
 //
 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 64] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][0, 0] [4, 64] [1, 1]
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
 // CHECK-NOT: tensor.pad