
Commit 8bf5bfa

yzhang93 authored and weidel-p committed
[Codegen] Allow pre-padding other dims of a conv except the input channel (iree-org#22296)
The previous PR disabled padding for all conv dimensions when the input channel size is much smaller than the padding size. However, for backward conv with the CHWN layout, when the batch and input channel dimensions are both unaligned, it is still useful to pad the batch dimension. This PR fixes iree-org#22277.

---------

Signed-off-by: yzhang93 <[email protected]>
Signed-off-by: Philipp <[email protected]>
1 parent a407f83 commit 8bf5bfa

File tree

2 files changed (+52, -32 lines)
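Before the per-file diffs, here is a minimal standalone sketch of the decision this commit adds: for a batch-last input layout (e.g. CHWN), only the batch dimension is pre-padded, and pre-padding is skipped when the batch is already aligned. The helper name computeConvPaddingSizes and its plain std::vector/std::optional signature are illustrative assumptions only; the actual logic is the new batch-last branch of getPaddingConvSizes() in ConfigUtils.cpp below, which operates on MLIR builders, affine expressions, and attributes.

#include <cstdint>
#include <optional>
#include <vector>

// Sketch only: mirrors the control flow of the new batch-last branch.
std::optional<std::vector<int64_t>> computeConvPaddingSizes(
    bool isBatchDimLast,      // true for batch-last layouts such as CHWN
    int64_t batchSize,        // loop bound of the last batch dimension
    int64_t batchPaddingSize, // IGEMM padding size mapped onto that dimension
    size_t batchDimIndex,     // position of that dimension among the conv loops
    size_t numConvLoops) {    // total number of conv loops
  if (!isBatchDimLast)
    return std::nullopt; // other layouts go through the input-channel logic
  // If the batch is already aligned, pre-padding the conv buys nothing, so
  // fall back to padding the IGEMM instead (no padding_conv attribute).
  if (batchPaddingSize != 0 && batchSize % batchPaddingSize == 0)
    return std::nullopt;
  // Pad only the batch dimension; every other conv dimension stays at 0.
  std::vector<int64_t> paddingConvSizes(numConvLoops, 0);
  paddingConvSizes[batchDimIndex] = batchPaddingSize;
  return paddingConvSizes;
}

For the @conv_chwn_chwf_unaligned_batch test updated in this commit (batch loop of size 40, workgroup tile 16), computeConvPaddingSizes(true, 40, 16, 3, 7) returns {0, 0, 0, 16, 0, 0, 0}, matching the new padding_conv expectation.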

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 42 additions & 22 deletions
@@ -369,7 +369,7 @@ static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
 }
 
 struct ConvToIgemmInfo {
-  bool isInputChannelLast;
+  bool isBatchDimLast;
   bool isSpatialDimLast;
   linalg::ConvolutionDimensions convDims;
   DenseMap<int64_t, AffineExpr> convToIgemmDimMap;
@@ -392,14 +392,28 @@ getPaddingConvSizes(Builder &b, const SmallVector<int64_t> &bounds,
 
   DenseMap<int64_t, AffineExpr> convToIgemmMap =
       convToIgemmInfo->convToIgemmDimMap;
-  // Padding sizes for parallel dimensions are the same as workgroup tile
-  // sizes.
   DenseSet<int64_t> paddedIGEMMDims;
   DenseMap<int64_t, SmallVector<int64_t>> paddedReductionConvDims;
   linalg::ConvolutionDimensions convDims = convToIgemmInfo->convDims;
   SetVector<int64_t> inputChannelDims(convDims.inputChannel.begin(),
                                       convDims.inputChannel.end());
   SmallVector<int64_t> paddingConvSizes(convToIgemmMap.size(), 0);
+
+  // For batch-last layout (e.g., CHWN), only pad the batch dimension to avoid
+  // introducing pad op as the producer of collapse_shape op which may cause
+  // fusion problem.
+  if (convToIgemmInfo->isBatchDimLast) {
+    int64_t lastBatchDim = convDims.batch.back();
+    auto IGEMMDimExpr = cast<AffineDimExpr>(convToIgemmMap[lastBatchDim]);
+    unsigned IGEMMBatchPos = IGEMMDimExpr.getPosition();
+    if (paddingSizes[IGEMMBatchPos] &&
+        bounds[IGEMMBatchPos] % paddingSizes[IGEMMBatchPos] == 0) {
+      return std::nullopt;
+    }
+    paddingConvSizes[lastBatchDim] = paddingSizes[IGEMMBatchPos];
+    return b.getI64ArrayAttr(paddingConvSizes);
+  }
+
   for (auto [convDim, IGEMMExpr] : convToIgemmMap) {
     auto IGEMMDimExpr = cast<AffineDimExpr>(IGEMMExpr);
     unsigned IGEMMPos = IGEMMDimExpr.getPosition();
@@ -415,19 +429,21 @@ getPaddingConvSizes(Builder &b, const SmallVector<int64_t> &bounds,
     // Only pad input channel dims. If we need to pad filter dims, then we
     // would rather just do padding on the GEMM instead.
     if (inputChannelDims.contains(convDim)) {
+      // Multiple input channel dims for a single IGEMMPos is not supported.
+      if (paddedIGEMMDims.contains(IGEMMPos)) {
+        return std::nullopt;
+      }
       int64_t inputChannelSize =
          convToIgemmInfo->inputChannelDimToSize[convDim];
      bool isInputChannelSizeSmall =
          (paddingSizes[IGEMMPos] / inputChannelSize > 2);
-      // The following cases are not supported:
-      // 1) Input channel is not the innermost dimension;
-      // 2) Input channel size is too small compared to padding size;
-      // 3) Multiple input channel dims for a single IGEMMPos.
-      if (!convToIgemmInfo->isInputChannelLast || isInputChannelSizeSmall ||
-          paddedIGEMMDims.contains(IGEMMPos)) {
-        return std::nullopt;
+      // If the input channel dimension is much smaller than the padding size,
+      // skip padding along that dimension while still padding the others.
+      if (isInputChannelSizeSmall) {
+        paddingConvSizes[convDim] = 0;
+      } else {
+        paddingConvSizes[convDim] = paddingSizes[IGEMMPos];
       }
-      paddingConvSizes[convDim] = paddingSizes[IGEMMPos];
       paddedIGEMMDims.insert(IGEMMPos);
     }
     continue;
@@ -766,16 +782,14 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
       kPackFactor = std::get<2>(mmaKind.getMNKShape());
     }
     paddingTileSizes[innerKDim] *= kPackFactor;
+    attrs.emplace_back("padding", b.getI64ArrayAttr(paddingTileSizes));
 
     // Create `padding_conv` attribute when padding convolutions before IGEMM
-    // is possible, otherwise fallback to pad IGEMM.
+    // is possible.
     if (auto attr =
            getPaddingConvSizes(b, bounds, paddingTileSizes, workgroupTileSizes,
                                reductionTileSizes, convToIgemmInfo)) {
-      attrs.emplace_back(StringAttr::get(context, "padding_conv"), *attr);
-    } else {
-      attrs.emplace_back(StringAttr::get(context, "padding"),
-                         b.getI64ArrayAttr(paddingTileSizes));
+      attrs.emplace_back("padding_conv", *attr);
     }
   }
   auto configDict = DictionaryAttr::get(context, attrs);
@@ -812,13 +826,12 @@ LogicalResult setIGEMMConvolutionLoweringConfig(
   auto inputType = llvm::cast<ShapedType>(op->getOperands()[0].getType());
   ArrayRef<int64_t> inputShape = inputType.getShape();
   AffineMap inputMap = linalgOp.getIndexingMapsArray()[0];
-  SmallVector<int64_t> inputChannelPos;
   SmallVector<int64_t> inputImagePos;
+  SmallVector<int64_t> batchPos;
   for (auto dim : igemmGenericConvDetails->convDims.inputChannel) {
     for (auto [idx, e] : llvm::enumerate(inputMap.getResults())) {
       if (e.isFunctionOfDim(dim)) {
         convToIgemmInfo.inputChannelDimToSize[dim] = inputShape[idx];
-        inputChannelPos.push_back(idx);
       }
     }
   }
@@ -829,12 +842,19 @@ LogicalResult setIGEMMConvolutionLoweringConfig(
       }
     }
   }
-  llvm::sort(inputChannelPos);
+  for (auto dim : igemmGenericConvDetails->convDims.batch) {
+    for (auto [idx, e] : llvm::enumerate(inputMap.getResults())) {
+      if (e.isFunctionOfDim(dim)) {
+        batchPos.push_back(idx);
+      }
+    }
+  }
   llvm::sort(inputImagePos);
-  convToIgemmInfo.isInputChannelLast =
-      inputChannelPos.back() == inputShape.size() - 1;
+  llvm::sort(batchPos);
+  convToIgemmInfo.isBatchDimLast =
+      !batchPos.empty() && batchPos.back() == inputShape.size() - 1;
   convToIgemmInfo.isSpatialDimLast =
-      inputImagePos.back() == inputShape.size() - 1;
+      !inputImagePos.empty() && inputImagePos.back() == inputShape.size() - 1;
   convToIgemmInfo.convDims = igemmGenericConvDetails->convDims;
   convToIgemmInfo.convToIgemmDimMap =
      igemmGenericConvDetails->convToIgemmDimMap;
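The second behavioral change in this file is that a "too small" input channel no longer disables pre-padding for the whole conv; it only zeroes the padding for that one dimension. A hypothetical helper isolating that rule (the name inputChannelPaddingSize is not IREE API, just a restatement of the ratio test above):

#include <cstdint>

// Pre-padding size for an input channel dimension, given the IGEMM padding
// size mapped onto it. Uses the same test as the patch:
// paddingSizes[IGEMMPos] / inputChannelSize > 2 means "too small to pad".
int64_t inputChannelPaddingSize(int64_t igemmPaddingSize,
                                int64_t inputChannelSize) {
  bool isInputChannelSizeSmall = (igemmPaddingSize / inputChannelSize) > 2;
  return isInputChannelSizeSmall ? 0 : igemmPaddingSize;
}

For the NHWC small-channel test below (input channel size 3 against an IGEMM padding size of 32), this yields 0, so the channel dimension stays unpadded while the parallel dimensions are still pre-padded; that is why the test now expects padding_conv = [1, 4, 32, 64, 0, 0, 0] in addition to padding = [1, 4, 32, 64, 32].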

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir

Lines changed: 10 additions & 10 deletions
@@ -220,7 +220,7 @@ func.func @conv_chwn_chwf_unaligned_batch(%arg0: tensor<16x193x129x40xbf16>, %ar
 // CHECK-SAME: subgroup = [1, 1, 1, 1, 0]
 // CHECK-SAME: workgroup = [16, 1, 1, 16, 0]
 
-// PAD-CONV-GFX942: padding_conv = [16, 1, 1, 16, 0, 0, 0]
+// PAD-CONV-GFX942: padding_conv = [0, 0, 0, 16, 0, 0, 0]
 
 // -----
 
@@ -305,19 +305,19 @@ module {
 #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1 + d5 * 2, d2 + d6 * 2, d3)>
 #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d0)>
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
-func.func @conv_chwn_chwf_no_pad_conv(%arg0: tensor<2x192x128x40xbf16>, %arg1: tensor<2x95x63x40xbf16>, %arg2: tensor<40x3x3x40xf32>) -> tensor<40x3x3x40xf32> {
-  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<2x192x128x40xbf16>, tensor<2x95x63x40xbf16>) outs(%arg2 : tensor<40x3x3x40xf32>) {
+func.func @conv_chwn_chwf_aligned_batch(%arg0: tensor<2x192x128x48xbf16>, %arg1: tensor<2x95x63x40xbf16>, %arg2: tensor<40x3x3x48xf32>) -> tensor<40x3x3x48xf32> {
+  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<2x192x128x48xbf16>, tensor<2x95x63x40xbf16>) outs(%arg2 : tensor<40x3x3x48xf32>) {
   ^bb0(%in: bf16, %in_0: bf16, %out: f32):
     %1 = arith.extf %in : bf16 to f32
     %2 = arith.extf %in_0 : bf16 to f32
     %3 = arith.mulf %1, %2 : f32
     %4 = arith.addf %out, %3 : f32
     linalg.yield %4 : f32
-  } -> tensor<40x3x3x40xf32>
-  return %0 : tensor<40x3x3x40xf32>
+  } -> tensor<40x3x3x48xf32>
+  return %0 : tensor<40x3x3x48xf32>
 }
 
-// CHECK-LABEL: func.func @conv_chwn_chwf_no_pad_conv
+// CHECK-LABEL: func.func @conv_chwn_chwf_aligned_batch
 // PAD-CONV-GFX942: padding = [16, 1, 1, 16, 16]
 // PAD-CONV-GFX942-NOT: padding_conv
 
@@ -326,7 +326,7 @@
 #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
 #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d3, d4, d5, d6)>
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
-func.func @conv_nhwc_small_channel_no_pad_conv(%arg0: tensor<16x26x19x3xf16>, %arg1: tensor<287x3x3x3xf16>, %arg2: tensor<16x24x17x287xf32>) -> tensor<16x24x17x287xf32> {
+func.func @conv_nhwc_small_channel_size(%arg0: tensor<16x26x19x3xf16>, %arg1: tensor<287x3x3x3xf16>, %arg2: tensor<16x24x17x287xf32>) -> tensor<16x24x17x287xf32> {
   %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<16x26x19x3xf16>, tensor<287x3x3x3xf16>) outs(%arg2 : tensor<16x24x17x287xf32>) {
   ^bb0(%in: f16, %in_0: f16, %out: f32):
     %1 = arith.extf %in : f16 to f32
@@ -338,6 +338,6 @@ func.func @conv_nhwc_small_channel_no_pad_conv(%arg0: tensor<16x26x19x3xf16>, %a
   return %0 : tensor<16x24x17x287xf32>
 }
 
-// CHECK-LABEL: func.func @conv_nhwc_small_channel_no_pad_conv
-// PAD-CONV-GFX942: padding = [1, 4, 32, 64, 32]
-// PAD-CONV-GFX942-NOT: padding_conv
+// CHECK-LABEL: func.func @conv_nhwc_small_channel_size
+// PAD-CONV-GFX942: padding = [1, 4, 32, 64, 32]
+// PAD-CONV-GFX942: padding_conv = [1, 4, 32, 64, 0, 0, 0]
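As a worked example of what those sizes mean, assuming (as with IREE's other padding configs) that each non-zero entry rounds the corresponding conv loop bound up to the next multiple of that size, the small-channel case works out as follows; roundUpToMultiple is a hypothetical helper for illustration only:

#include <cstdint>
#include <vector>

// Round `value` up to the next multiple of `multiple` (0 means "no padding").
int64_t roundUpToMultiple(int64_t value, int64_t multiple) {
  return multiple == 0 ? value : ((value + multiple - 1) / multiple) * multiple;
}

int main() {
  // Conv loop bounds of @conv_nhwc_small_channel_size:
  // (n, oh, ow, oc, kh, kw, ic) = (16, 24, 17, 287, 3, 3, 3).
  std::vector<int64_t> bounds = {16, 24, 17, 287, 3, 3, 3};
  std::vector<int64_t> paddingConv = {1, 4, 32, 64, 0, 0, 0};
  for (size_t i = 0; i < bounds.size(); ++i)
    bounds[i] = roundUpToMultiple(bounds[i], paddingConv[i]);
  // bounds is now {16, 24, 32, 320, 3, 3, 3}: the unaligned ow and oc loops
  // get padded, while the small input channel (ic = 3) is left untouched.
  return 0;
}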
