[Codegen][IGEMM] Set convolution pre-padding as default (iree-org#21899)

yzhang93 · web-flow · commit ec3e28e0068e · 2025-09-10T12:21:44.000-07:00
As for now, all the previous large regressions caused by convolution
pre-padding has been fixed, we'll set pre-padding as default for IGEMM
path.

In addition, this PR also removes a over-strict condition in
`swapCollapseShapeWithSlice` pattern.

---------

Signed-off-by: yzhang93 &lt;zhyuhang88@gmail.com&gt;
diff --git a/compiler/src/iree/compiler/Codegen/Common/Transforms.cpp b/compiler/src/iree/compiler/Codegen/Common/Transforms.cpp
@@ -484,11 +484,13 @@ swapCollapseShapeWithSlice(RewriterBase &rewriter,
        llvm::zip_equal(collapsedSizes, collapsedOffsets,
                        collapseShapeOp.getReassociationIndices())) {
     // CASE #1 - size and/or offset are dynamic.
+    // TODO(vivian): For some special case, running this pattern with dynamic
+    // `collapsedSize` may cause a dynamic allocation in workgroup which blocks
+    // `GPUReduceBankConflictsPass`.
     if (isa<Value>(collapsedSize) || isa<Value>(collapsedOffset)) {
       // Special case especially for collapse shape of convolution filter in
       // IGEMM, while the offset is dynamic and the size is static.
-      if (isa<Attribute>(collapsedSize) && isa<Value>(collapsedOffset) &&
-          reassocIndices.size() != 1) {
+      if (isa<Attribute>(collapsedSize) && isa<Value>(collapsedOffset)) {
         auto maybeStaticSize = getConstantIntValue(collapsedSize);
         if (!maybeStaticSize) {
           return rewriter.notifyMatchFailure(sliceOp,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -127,7 +127,7 @@ static llvm::cl::opt<bool>
 llvm::cl::opt<bool> clGPUPadConvolution(
     "iree-codegen-llvmgpu-igemm-pad-convolution",
     llvm::cl::desc("enable pre-padding for convolutions in igemm path"),
-    llvm::cl::init(false));
+    llvm::cl::init(true));
 
 static llvm::cl::opt<bool>
     clUseDirectLoad("iree-llvmgpu-use-direct-load",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
@@ -143,9 +143,9 @@ hal.executable private @main {
 //      CHECK-DAG:   %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
 //      CHECK-DAG:   %[[ASSUMED_B2:.+]] = memref.assume_alignment %[[B2]], 64
 //      CHECK-DAG:   %[[BUF2:.+]] = amdgpu.fat_raw_buffer_cast %[[ASSUMED_B2]]
-//      CHECK-DAG:   memref.alloc() : memref<2x1x32x18xf32, #gpu.address_space<workgroup>>
-//      CHECK-DAG:   memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
-//      CHECK-DAG:   memref.alloc() : memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
+//      CHECK-DAG:   memref.alloc() : memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
+//      CHECK-DAG:   memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
+//      CHECK-DAG:   memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
 //      CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //      CHECK-DAG:   %[[C721:.+]] = arith.constant 721 : index
 //      CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
@@ -155,12 +155,14 @@ hal.executable private @main {
 //      CHECK-DAG:       %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<4xf16>
 //      CHECK-DAG:       %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<4xf16>
 // CHECK-COUNT-1:       amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
-//          CHECK:     %[[LOOP_T:.+]] = vector.shape_cast %[[LOOP]] : vector<1x1x1x1x4x1xf32> to vector<4xf32>
+//          CHECK:     %[[LOOP_T:.+]] = vector.shape_cast %[[LOOP]] : vector<1x1x1x1x4x1xf32> to vector<4x1x1xf32>
 //          CHECK:     vector.transfer_write %[[LOOP_T]]
 // Note there is a writeback loop here that is skipped to simplify the test.
 //          CHECK:        memref.copy {{.*}}#gpu.address_space<workgroup>> to {{.*}}#amdgpu.address_space<fat_raw_buffer>
 //          CHECK:   } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
 
+// TODO(vivian): This test doesn't have reduce bank conflicts taken effect because of collapsing a dynamic allocation. Try to fix it.
+
 // -----
 
 #pipeline_layout = #hal.pipeline.layout<bindings = [