iree-org
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
Lines changed: 1 addition & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/GPUPadConvs.cpp
Lines changed: 79 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/GPUPadConvs.cpp
Lines changed: 79 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
Lines changed: 10 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
Lines changed: 10 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
Lines changed: 1 addition & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pad_convs.mlir
Lines changed: 63 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pad_convs.mlir
Lines changed: 63 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
Lines changed: 1 addition & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/Transforms.cpp
Lines changed: 6 additions & 1 deletion b/‎compiler/src/iree/compiler/Codegen/Common/Transforms.cpp
Lines changed: 6 additions & 1 deletion
diff --git a/‎compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.cpp
Lines changed: 5 additions & 2 deletions b/‎compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.cpp
Lines changed: 5 additions & 2 deletions
@@ -80,6 +80,7 @@ iree_compiler_cc_library(
         "GPUMultiBuffering.cpp",
         "GPUNestedLayoutDistributionPatterns.cpp",
         "GPUPackToIntrinsics.cpp",
+        "GPUPadConvs.cpp",
         "GPUPadOperands.cpp",
         "GPUPatterns.cpp",
         "GPUPipelining.cpp",
 
@@ -73,6 +73,7 @@ iree_cc_library(
     "GPUMultiBuffering.cpp"
     "GPUNestedLayoutDistributionPatterns.cpp"
     "GPUPackToIntrinsics.cpp"
+    "GPUPadConvs.cpp"
     "GPUPadOperands.cpp"
     "GPUPatterns.cpp"
     "GPUPipelining.cpp"
 
@@ -0,0 +1,79 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Transforms/Passes.h"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_GPUPADCONVSPASS
+#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"
+
+namespace {
+
+/// Pads the operands of `tilingInterfaceOp` via `linalg::rewriteAsPaddedOp`.
+///
+/// `paddingSizes` is forwarded as the padding sizes of the rewrite with
+/// "pad to multiple of" enabled, so each entry is treated as a multiple to
+/// round up to rather than an absolute size. Every operand is padded with the
+/// zero value of its element type.
+///
+/// Returns failure (after emitting an error on the op) if the rewrite fails.
+static LogicalResult padToStaticSizes(RewriterBase &rewriter,
+                                      TilingInterface tilingInterfaceOp,
+                                      ArrayRef<OpFoldResult> paddingSizes) {
+  // One zero padding value per operand, typed after the operand's element type.
+  Operation *op = tilingInterfaceOp.getOperation();
+  SmallVector<Attribute> paddingValues;
+  paddingValues.reserve(op->getNumOperands());
+  for (Value operand : op->getOperands()) {
+    paddingValues.push_back(
+        rewriter.getZeroAttr(getElementTypeOrSelf(operand.getType())));
+  }
+
+  auto options = linalg::PadTilingInterfaceOptions()
+                     .setPaddingSizes(paddingSizes)
+                     .setPaddingValues(paddingValues)
+                     .setPadToMultipleOf(true);
+
+  // `padOps` is an out-parameter of the rewrite; the collected pad ops are not
+  // needed by this helper.
+  SmallVector<tensor::PadOp> padOps;
+  if (failed(linalg::rewriteAsPaddedOp(rewriter, tilingInterfaceOp, options,
+                                       padOps))) {
+    return tilingInterfaceOp->emitOpError("failed to pad op");
+  }
+
+  return success();
+}
+
+/// Function pass that pads convolution-like linalg ops.
+///
+/// For every `TilingInterface` op in the function that is a linalg op
+/// implementing the convolution interface and carries an
+/// `IREE::GPU::LoweringConfigAttr` with a conv padding list, the op is padded
+/// using those sizes. Ops that do not meet all of these conditions are left
+/// untouched. The pass fails on the first op that cannot be padded.
+struct GPUPadConvsPass final : impl::GPUPadConvsPassBase<GPUPadConvsPass> {
+  void runOnOperation() override {
+    FunctionOpInterface funcOp = getOperation();
+
+    IRRewriter rewriter(funcOp);
+    WalkResult walkResult = funcOp.walk([&](TilingInterface op) -> WalkResult {
+      auto linalgOp = dyn_cast<linalg::LinalgOp>(op.getOperation());
+      if (!linalgOp || !linalg::isaConvolutionOpInterface(linalgOp)) {
+        return WalkResult::advance();
+      }
+
+      auto loweringConfig =
+          getLoweringConfig<IREE::GPU::LoweringConfigAttr>(op);
+      if (!loweringConfig) {
+        return WalkResult::advance();
+      }
+
+      // Get padding sizes from lowering_config.
+      std::optional<SmallVector<int64_t>> paddingSizes =
+          getPaddingList(loweringConfig, /*padConv*/ true);
+      if (!paddingSizes) {
+        return WalkResult::advance();
+      }
+
+      SmallVector<OpFoldResult> padSizes =
+          getAsIndexOpFoldResult(rewriter.getContext(), *paddingSizes);
+      rewriter.setInsertionPoint(op);
+      if (failed(padToStaticSizes(rewriter, op, padSizes))) {
+        // Stop immediately: continuing would keep rewriting IR that is about
+        // to be rejected and could emit further, redundant diagnostics.
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
+
+    if (walkResult.wasInterrupted()) {
+      return signalPassFailure();
+    }
+  }
+};
+
+} // namespace
+} // namespace mlir::iree_compiler
@@ -195,6 +195,16 @@ def GPUPackToIntrinsicsPass :
   ];
 }
 
+// Function-level pass registered as `iree-codegen-gpu-pad-convs`. It pads the
+// operands of convolution ops based on the padding configuration attached to
+// the op.
+def GPUPadConvsPass :
+    InterfacePass<"iree-codegen-gpu-pad-convs",
+                  "mlir::FunctionOpInterface"> {
+  let summary = "Pass to pad operands of a convolution with padding configuration provided.";
+  let dependentDialects = [
+    "::mlir::linalg::LinalgDialect",
+    "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect"
+  ];
+}
+
 def GPUPadOperandsPass :
     InterfacePass<"iree-codegen-gpu-pad-operands",
                   "mlir::FunctionOpInterface"> {
 
@@ -45,6 +45,7 @@ iree_lit_test_suite(
             "gpu_nested_layout_vector_distribution_mask.mlir",
             "gpu_nested_layout_vector_distribution_multi_reduce.mlir",
             "gpu_nested_layout_vector_distribution_step.mlir",
+            "gpu_pad_convs.mlir",
             "gpu_pad_operands.mlir",
             "gpu_pipeline.mlir",
             "gpu_promote_matmul_operands.mlir",
 
@@ -41,6 +41,7 @@ iree_lit_test_suite(
     "gpu_nested_layout_vector_distribution_multi_reduce.mlir"
     "gpu_nested_layout_vector_distribution_step.mlir"
     "gpu_pack_to_instrinsics.mlir"
+    "gpu_pad_convs.mlir"
     "gpu_pad_operands.mlir"
     "gpu_pipeline.mlir"
     "gpu_promote_matmul_operands.mlir"
 
@@ -0,0 +1,63 @@
+// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-pad-convs))" | FileCheck %s
+
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d3, d4, d5, d6)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+#lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding_conv = [1, 8, 32, 32, 0, 0, 32]}>
+// Convolution expressed as a linalg.generic over 7 loops (d0..d6). With
+// padding_conv = [1, 8, 32, 32, 0, 0, 32], loops d2 (17), d3 (287) and d6 (287)
+// are not multiples of 32, so the operands are expected to be padded to
+// 32, 288 and 288 along those loops and the result sliced back afterwards.
+func.func @conv_2d_nhwc_fhwc(%arg0: tensor<16x26x19x287xf16>, %arg1: tensor<287x3x3x287xf16>, %arg2: tensor<16x24x17x287xf32>) -> tensor<16x24x17x287xf32> {
+  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<16x26x19x287xf16>, tensor<287x3x3x287xf16>) outs(%arg2 : tensor<16x24x17x287xf32>) attrs = {lowering_config = #lowering_config} {
+  ^bb0(%in: f16, %in_0: f16, %out: f32):
+    %1 = arith.extf %in : f16 to f32
+    %2 = arith.extf %in_0 : f16 to f32
+    %3 = arith.mulf %1, %2 : f32
+    %4 = arith.addf %out, %3 : f32
+    linalg.yield %4 : f32
+  } -> tensor<16x24x17x287xf32>
+  return %0 : tensor<16x24x17x287xf32>
+}
+
+// CHECK-LABEL: func.func @conv_2d_nhwc_fhwc
+//  CHECK-SAME:   %[[A:[A-Za-z0-9]+]]: tensor<16x26x19x287xf16>
+//  CHECK-SAME:   %[[B:[A-Za-z0-9]+]]: tensor<287x3x3x287xf16>
+//  CHECK-SAME:   %[[C:[A-Za-z0-9]+]]: tensor<16x24x17x287xf32>
+//       CHECK:   %[[PADDED_LHS:.+]] = tensor.pad %[[A]] low[0, 0, 0, 0] high[0, 0, 15, 1]
+//       CHECK:   %[[PADDED_RHS:.+]] = tensor.pad %[[B]] low[0, 0, 0, 0] high[1, 0, 0, 1]
+//       CHECK:   %[[PADDED_INIT:.+]] = tensor.pad %[[C]] low[0, 0, 0, 0] high[0, 0, 15, 1]
+//       CHECK:   %[[PADDED_RESULT:.+]] = linalg.generic
+//  CHECK-SAME:     ins(%[[PADDED_LHS]], %[[PADDED_RHS]] : tensor<16x26x34x288xf16>, tensor<288x3x3x288xf16>)
+//  CHECK-SAME:     outs(%[[PADDED_INIT]] : tensor<16x24x32x288xf32>)
+//       CHECK:   %[[EXTRACT:.+]] = tensor.extract_slice %[[PADDED_RESULT]][0, 0, 0, 0] [16, 24, 17, 287] [1, 1, 1, 1]
+//  CHECK-SAME:     : tensor<16x24x32x288xf32> to tensor<16x24x17x287xf32>
+//       CHECK:   return %[[EXTRACT]] : tensor<16x24x17x287xf32>
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1 + d5 * 2, d2 + d6 * 2, d3)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d0)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+#lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_BF16>, padding_conv = [16, 1, 1, 16, 0, 0, 0]}>
+// Strided convolution (input window indexed with d1 + d5 * 2, d2 + d6 * 2)
+// expressed as a linalg.generic over 7 loops. With
+// padding_conv = [16, 1, 1, 16, 0, 0, 0], loops d0 and d3 (both 40) are
+// expected to be padded to 48; the other operand dimensions stay unchanged.
+func.func @conv_2d_chwn_chwf(%arg0: tensor<16x193x129x40xbf16>, %arg1: tensor<16x96x64x40xbf16>, %arg2: tensor<40x3x3x40xf32>) -> tensor<40x3x3x40xf32> {
+  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<16x193x129x40xbf16>, tensor<16x96x64x40xbf16>) outs(%arg2 : tensor<40x3x3x40xf32>) attrs =  {lowering_config = #lowering_config} {
+  ^bb0(%in: bf16, %in_0: bf16, %out: f32):
+    %1 = arith.extf %in : bf16 to f32
+    %2 = arith.extf %in_0 : bf16 to f32
+    %3 = arith.mulf %1, %2 : f32
+    %4 = arith.addf %out, %3 : f32
+    linalg.yield %4 : f32
+  } -> tensor<40x3x3x40xf32>
+  return %0 : tensor<40x3x3x40xf32>
+}
+
+// CHECK-LABEL: func.func @conv_2d_chwn_chwf
+//  CHECK-SAME:   %[[A:[A-Za-z0-9]+]]: tensor<16x193x129x40xbf16>
+//  CHECK-SAME:   %[[B:[A-Za-z0-9]+]]: tensor<16x96x64x40xbf16>
+//  CHECK-SAME:   %[[C:[A-Za-z0-9]+]]: tensor<40x3x3x40xf32>
+//       CHECK:   %[[PADDED_LHS:.+]] = tensor.pad %[[A]] low[0, 0, 0, 0] high[0, 0, 0, 8]
+//       CHECK:   %[[PADDED_RHS:.+]] = tensor.pad %[[B]] low[0, 0, 0, 0] high[0, 0, 0, 8]
+//       CHECK:   %[[PADDED_INIT:.+]] = tensor.pad %[[C]] low[0, 0, 0, 0] high[8, 0, 0, 8]
+//       CHECK:   %[[PADDED_RESULT:.+]] = linalg.generic
+//  CHECK-SAME:     ins(%[[PADDED_LHS]], %[[PADDED_RHS]] : tensor<16x193x129x48xbf16>, tensor<16x96x64x48xbf16>)
+//  CHECK-SAME:     outs(%[[PADDED_INIT]] : tensor<48x3x3x48xf32>)
+//       CHECK:   %[[EXTRACT:.+]] = tensor.extract_slice %[[PADDED_RESULT]][0, 0, 0, 0] [40, 3, 3, 40] [1, 1, 1, 1]
+//  CHECK-SAME:     : tensor<48x3x3x48xf32> to tensor<40x3x3x40xf32>
+//       CHECK:   return %[[EXTRACT]] : tensor<40x3x3x40xf32>
@@ -363,6 +363,7 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
   // Cleanup patterns for tile and distribute
   {
     RewritePatternSet patterns(context);
+    populateSwapExtractWithCollapsePattern(patterns);
     linalg::populateLinalgTilingCanonicalizationPatterns(patterns);
     tensor::populateFoldTensorEmptyPatterns(patterns);
     context->getOrLoadDialect<tensor::TensorDialect>()
 
@@ -516,9 +516,14 @@ swapCollapseShapeWithSlice(RewriterBase &rewriter,
                                              "collapsed size must be static");
         }
 
+        // Compose all nested affine.apply chains and check if the offset is
+        // multiple of collapsed size.
+        SmallVector<Value> operands(applyOp.getOperands());
+        affine::fullyComposeAffineMapAndOperands(&map, &operands);
+        map = simplifyAffineMap(map);
         if (!map.getResult(0).isMultipleOf(maybeStaticSize.value())) {
           return rewriter.notifyMatchFailure(
-              sliceOp, "collapsed size is not divisible by offset multiplier");
+              sliceOp, "offset multiplier must be multiple of collapsed size");
         }
 
         unsigned lastReassocSize = srcShape[reassocIndices.back()];
 
@@ -178,9 +178,12 @@ setPromotedOperandsList(MLIRContext *context,
 }
 
 constexpr StringLiteral kPaddingName = "padding";
+constexpr StringLiteral kPaddingConvName = "padding_conv";
 
-std::optional<SmallVector<int64_t>> getPaddingList(LoweringConfigAttr config) {
-  auto array = config.getAttributes().getAs<ArrayAttr>(kPaddingName);
+std::optional<SmallVector<int64_t>> getPaddingList(LoweringConfigAttr config,
+                                                   bool paddingConv) {
+  auto attrName = paddingConv ? kPaddingConvName : kPaddingName;
+  auto array = config.getAttributes().getAs<ArrayAttr>(attrName);
   if (!array) {
     return std::nullopt;
   }
Original file line number	Diff line number	Diff line change
`@@ -363,6 +363,7 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {`
`363`	`363`	`// Cleanup patterns for tile and distribute`
`364`	`364`	`{`
`365`	`365`	`RewritePatternSet patterns(context);`
	`366`	`+ populateSwapExtractWithCollapsePattern(patterns);`
`366`	`367`	`linalg::populateLinalgTilingCanonicalizationPatterns(patterns);`
`367`	`368`	`tensor::populateFoldTensorEmptyPatterns(patterns);`
`368`	`369`	`context->getOrLoadDialect<tensor::TensorDialect>()`
Original file line number	Diff line number	Diff line change
`@@ -516,9 +516,14 @@ swapCollapseShapeWithSlice(RewriterBase &rewriter,`
`516`	`516`	`"collapsed size must be static");`
`517`	`517`	`}`
`518`	`518`
	`519`	`+ // Compose all nested affine.apply chains and check if the offset is`
	`520`	`+ // multiple of collapsed size.`
	`521`	`+ SmallVector<Value> operands(applyOp.getOperands());`
	`522`	`+ affine::fullyComposeAffineMapAndOperands(&map, &operands);`
	`523`	`+ map = simplifyAffineMap(map);`
`519`	`524`	`if (!map.getResult(0).isMultipleOf(maybeStaticSize.value())) {`
`520`	`525`	`return rewriter.notifyMatchFailure(`
`521`		`- sliceOp, "collapsed size is not divisible by offset multiplier");`
	`526`	`+ sliceOp, "offset multiplier must be multiple of collapsed size");`
`522`	`527`	`}`
`523`	`528`
`524`	`529`	`unsigned lastReassocSize = srcShape[reassocIndices.back()];`
Original file line number	Diff line number	Diff line change
`@@ -178,9 +178,12 @@ setPromotedOperandsList(MLIRContext *context,`
`178`	`178`	`}`
`179`	`179`
`180`	`180`	`constexpr StringLiteral kPaddingName = "padding";`
	`181`	`+constexpr StringLiteral kPaddingConvName = "padding_conv";`
`181`	`182`
`182`		`-std::optional<SmallVector<int64_t>> getPaddingList(LoweringConfigAttr config) {`
`183`		`- auto array = config.getAttributes().getAs<ArrayAttr>(kPaddingName);`
	`183`	`+std::optional<SmallVector<int64_t>> getPaddingList(LoweringConfigAttr config,`
	`184`	`+ bool paddingConv) {`
	`185`	`+ auto attrName = paddingConv ? kPaddingConvName : kPaddingName;`
	`186`	`+ auto array = config.getAttributes().getAs<ArrayAttr>(attrName);`
`184`	`187`	`if (!array) {`
`185`	`188`	`return std::nullopt;`
`186`	`189`	`}`