
Commit 8f76e08

yzhang93 authored and weidel-p committed
[DispatchCreation] Add split reduction for weight backward convs (iree-org#22275)
Weight backward convolutions have a special CHWN layout, where the filter sizes (corresponding to output image sizes in forward convolutions) are typically large, while the output spatial dimensions are small. This makes the split reduction strategy particularly effective. This PR adds support for splitting these convs along the input channel dimension.

Some experimental thresholds are applied to filter out cases that won't benefit from split reduction. In particular:

- When the batch and output channel sizes are large, the workload tends to be distributed across many workgroups, so split reduction has little to no effect.
- When the input spatial sizes are small while the batch and output channel sizes are relatively larger (medium size), split reduction often has no effect or even degrades performance.

---------

Signed-off-by: yzhang93 <[email protected]>
Signed-off-by: Philipp <[email protected]>
1 parent db4f657

4 files changed: +209 -6 lines changed
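For intuition about the thresholds described in the commit message, the following is a minimal standalone C++ sketch of the heuristic. It is not code from the patch: weightBackwardSplitSizes and its flattened inputs are a hypothetical simplification, while the threshold constants (512/128/32) and the ceil-division tile-size computation mirror the pass below.

// Standalone sketch of the split-reduction heuristic (hypothetical helper,
// not code from the patch).
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

std::optional<std::vector<int64_t>> weightBackwardSplitSizes(
    std::vector<int64_t> reductionSizes, // sizes of the reduction loops
    std::size_t cDim,                    // index of the input channel loop
    int64_t minInputSpatialSize,         // smallest input spatial extent
    int64_t outputChannelSize, int64_t batchSize) {
  // Empirical thresholds from the patch.
  const int64_t largeDimSize = 512;
  const int64_t mediumDimSize = 128;
  const int64_t smallDimSize = 32;

  // Check 1: a large batch and output channel already fill many workgroups.
  if (outputChannelSize >= largeDimSize && batchSize >= largeDimSize)
    return std::nullopt;

  // Check 2: small input spatial sizes with medium batch/output channel.
  if (minInputSpatialSize < smallDimSize &&
      outputChannelSize > mediumDimSize && batchSize > mediumDimSize)
    return std::nullopt;

  // Only the input channel dimension is split; a size-1 channel cannot be.
  if (reductionSizes[cDim] == 1)
    return std::nullopt;
  reductionSizes[cDim] = static_cast<int64_t>(
      std::ceil(float(reductionSizes[cDim]) / largeDimSize));
  return reductionSizes;
}

int main() {
  // Shapes of the first lit test below: reduction sizes [16, 225, 225] with
  // the input channel at index 0, input spatial 227x227, output channel 64,
  // batch 16. ceil(16 / 512) == 1, so the result is [1, 225, 225], matching
  // the iree_linalg_ext.split_reduction attribute the test expects.
  auto sizes = weightBackwardSplitSizes({16, 225, 225}, /*cDim=*/0,
                                        /*minInputSpatialSize=*/227,
                                        /*outputChannelSize=*/64,
                                        /*batchSize=*/16);
  return sizes ? 0 : 1;
}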

compiler/src/iree/compiler/DispatchCreation/SetSplitReductionSizes.cpp

Lines changed: 135 additions & 6 deletions
@@ -4,10 +4,9 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtInterfaces.h"
 #include "iree/compiler/DispatchCreation/Passes.h"
-
-#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
 #include "llvm/Support/DebugLog.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"

@@ -94,12 +93,17 @@ struct SetSplitReductionSizesPass final
       return;
     }
 
-    std::optional<SmallVector<int64_t>> tileSizes =
-        getOuterReductionSizes(tilingOp);
-    if (!tileSizes) {
+    // --- Case 1: Outer reduction ---
+    if (auto tileSizes = getOuterReductionSizes(tilingOp)) {
+      IREE::LinalgExt::setSplitReductionAttribute(tilingOp, *tileSizes);
+      return;
+    }
+
+    // --- Case 2: Generic weight backward convolution ---
+    if (auto tileSizes = getWeightBackwardReductionSizes(tilingOp)) {
+      IREE::LinalgExt::setSplitReductionAttribute(tilingOp, *tileSizes);
       return;
     }
-    IREE::LinalgExt::setSplitReductionAttribute(tilingOp, tileSizes.value());
   });
 }

@@ -143,6 +147,131 @@ struct SetSplitReductionSizesPass final
     }
     return tileSizes;
   }
+
+  /// Determines split reduction sizes for weight backward convolutions.
+  /// These convolutions have a special CHWN layout, where the filter sizes
+  /// (corresponding to output image sizes in forward convolutions) are
+  /// typically large, while the output spatial dimensions are small. This
+  /// makes the split reduction strategy particularly effective. Currently,
+  /// splitting is only applied along the input channel dimension.
+  std::optional<SmallVector<int64_t>>
+  getWeightBackwardReductionSizes(PartialReductionOpInterface op) const {
+    // First check if the input op is a convolution with CHWN layout.
+    auto linalgOp = dyn_cast<linalg::LinalgOp>(op.getOperation());
+    if (!linalgOp || !linalg::isaConvolutionOpInterface(linalgOp)) {
+      LDBG() << "skipping op; not convolution";
+      return std::nullopt;
+    }
+
+    FailureOr<mlir::linalg::ConvolutionDimensions> convDims =
+        mlir::linalg::inferConvolutionDims(linalgOp);
+    if (failed(convDims)) {
+      LDBG() << "skipping op; failed to infer convolution dims";
+      return std::nullopt;
+    }
+
+    if (convDims->inputChannel.empty() || convDims->outputChannel.empty() ||
+        convDims->batch.empty() || convDims->filterLoop.empty()) {
+      LDBG() << "skipping op; missing convolution dimensions";
+      return std::nullopt;
+    }
+
+    OpOperand *input = linalgOp.getDpsInputOperand(0);
+    OpOperand *filter = linalgOp.getDpsInputOperand(1);
+    OpOperand *output = linalgOp.getDpsInitOperand(0);
+
+    Value inputVal = input->get();
+    Value filterVal = filter->get();
+    Value outputVal = output->get();
+
+    ArrayRef<int64_t> inputShape =
+        llvm::cast<ShapedType>(inputVal.getType()).getShape();
+    ArrayRef<int64_t> filterShape =
+        llvm::cast<ShapedType>(filterVal.getType()).getShape();
+    ArrayRef<int64_t> outputShape =
+        llvm::cast<ShapedType>(outputVal.getType()).getShape();
+
+    if (ShapedType::isDynamicShape(inputShape) ||
+        ShapedType::isDynamicShape(filterShape) ||
+        ShapedType::isDynamicShape(outputShape)) {
+      LDBG() << "skipping op; has dynamic shape";
+      return std::nullopt;
+    }
+
+    AffineMap inputMap = linalgOp.getMatchingIndexingMap(input);
+    AffineMap filterMap = linalgOp.getMatchingIndexingMap(filter);
+    AffineMap outputMap = linalgOp.getMatchingIndexingMap(output);
+
+    std::optional<int64_t> batchLastDim = outputMap.getResultPosition(
+        getAffineDimExpr(convDims->batch.back(), outputMap.getContext()));
+    if (!batchLastDim || batchLastDim.value() != outputShape.size() - 1) {
+      LDBG() << "skipping op; not batch last layout";
+      return std::nullopt;
+    }
+
+    std::optional<int64_t> inputChannelDim = filterMap.getResultPosition(
+        getAffineDimExpr(convDims->inputChannel[0], filterMap.getContext()));
+    std::optional<int64_t> filterDim = filterMap.getResultPosition(
+        getAffineDimExpr(convDims->filterLoop[0], filterMap.getContext()));
+    if (!inputChannelDim || !filterDim ||
+        inputChannelDim.value() > filterDim.value()) {
+      LDBG() << "skipping op; not channel first layout";
+      return std::nullopt;
+    }
+
+    std::optional<int64_t> outputChannelDim = outputMap.getResultPosition(
+        getAffineDimExpr(convDims->outputChannel[0], outputMap.getContext()));
+    if (!outputChannelDim) {
+      LDBG() << "skipping op; has no output channel dim";
+      return std::nullopt;
+    }
+
+    std::optional<SmallVector<int64_t>> maybeSizes =
+        getReductionDimSizes(op.getOperation());
+    if (!maybeSizes) {
+      LDBG() << "skipping op; failed to get reduction sizes";
+      return std::nullopt;
+    }
+
+    // The constants below are determined based on empirical data.
+    const int64_t largeDimSize = 512;
+    const int64_t mediumDimSize = 128;
+    const int64_t smallDimSize = 32;
+
+    // When the batch and output channel sizes are large, the workload tends
+    // to be distributed across many workgroups, making split reduction have
+    // little to no effect.
+    int64_t outputChannelSize = outputShape[outputChannelDim.value()];
+    int64_t batchSize = outputShape[batchLastDim.value()];
+    if (outputChannelSize >= largeDimSize && batchSize >= largeDimSize) {
+      LDBG() << "skipping op; large output channel or batch size";
+      return std::nullopt;
+    }
+
+    // When the input spatial sizes are small while the batch and output
+    // channel sizes are relatively larger, split reduction often has no
+    // effect or even degrades performance.
+    for (auto dim : convDims->filterLoop) {
+      for (auto [idx, e] : llvm::enumerate(inputMap.getResults())) {
+        if (e.isFunctionOfDim(dim) && inputShape[idx] < smallDimSize &&
+            outputChannelSize > mediumDimSize && batchSize > mediumDimSize) {
+          LDBG() << "skipping op; small input spatial size";
+          return std::nullopt;
+        }
+      }
+    }
+
+    // Only split along the input channel dimension.
+    // TODO(vivian): split more reduction dimensions if needed.
+    int64_t cDim = inputChannelDim.value();
+    SmallVector<int64_t> tileSizes = std::move(*maybeSizes);
+    if (tileSizes[cDim] == 1) {
+      LDBG() << "skipping op; input channel size equals to 1";
+      return std::nullopt;
+    }
+    tileSizes[cDim] = std::ceil(float(tileSizes[cDim]) / largeDimSize);
+    return tileSizes;
+  }
 };
 } // namespace
 } // namespace mlir::iree_compiler::DispatchCreation
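The layout checks above boil down to comparing result positions in the indexing maps. Below is a minimal sketch, assuming an MLIR development build (the main() driver and assert are illustrative only, not part of the patch), of how AffineMap::getResultPosition distinguishes the channel-first filter layout used by the new tests.

// Minimal sketch of the "channel first" check (assumes an MLIR build; the
// main() driver and assert are illustrative, not code from the patch).
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"
#include <cassert>
#include <optional>

int main() {
  mlir::MLIRContext ctx;
  auto d = [&](unsigned i) { return mlir::getAffineDimExpr(i, &ctx); };
  // Filter map of the CHWN tests below: (d0, ..., d6) -> (d4, d5, d6, d0),
  // where d4 is the input channel and d5/d6 are the filter loops.
  mlir::AffineMap filterMap = mlir::AffineMap::get(
      /*dimCount=*/7, /*symbolCount=*/0, {d(4), d(5), d(6), d(0)}, &ctx);
  std::optional<unsigned> cPos = filterMap.getResultPosition(d(4)); // 0
  std::optional<unsigned> fPos = filterMap.getResultPosition(d(5)); // 1
  // Channel-first layout: the channel result precedes the filter-loop result.
  assert(cPos && fPos && *cPos < *fPos);
  return 0;
}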

compiler/src/iree/compiler/DispatchCreation/test/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ iree_lit_test_suite(
         "set_encoding_padding.mlir",
         "set_encoding_pipeline.mlir",
         "set_split_reduction_sizes.mlir",
+        "set_split_reduction_sizes_conv.mlir",
         "sink_reshapes.mlir",
         "split_reduction.mlir",
         "tensor_pad_to_tensor_insert_slice.mlir",

compiler/src/iree/compiler/DispatchCreation/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ iree_lit_test_suite(
     "set_encoding_padding.mlir"
     "set_encoding_pipeline.mlir"
     "set_split_reduction_sizes.mlir"
+    "set_split_reduction_sizes_conv.mlir"
     "sink_reshapes.mlir"
     "split_reduction.mlir"
     "tensor_pad_to_tensor_insert_slice.mlir"
compiler/src/iree/compiler/DispatchCreation/test/set_split_reduction_sizes_conv.mlir

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+// RUN: iree-opt %s --pass-pipeline="builtin.module(util.func(iree-dispatch-creation-set-split-reduction-sizes))" --split-input-file > %t
+// RUN: FileCheck %s < %t
+
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1 + d5, d2 + d6, d3)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d0)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+util.func public @conv_2d_chwn_chwf(%arg0: tensor<16x227x227x16xf32>, %arg1: tensor<16x225x225x64xf32>, %arg2: tensor<64x3x3x16xf32>) -> tensor<64x3x3x16xf32> {
+  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<16x227x227x16xf32>, tensor<16x225x225x64xf32>) outs(%arg2 : tensor<64x3x3x16xf32>) {
+  ^bb0(%in: f32, %in_3: f32, %out: f32):
+    %12 = arith.mulf %in, %in_3 : f32
+    %13 = arith.addf %out, %12 : f32
+    linalg.yield %13 : f32
+  } -> tensor<64x3x3x16xf32>
+  util.return %0 : tensor<64x3x3x16xf32>
+}
+
+// CHECK-LABEL: @conv_2d_chwn_chwf
+// CHECK: iree_linalg_ext.split_reduction = [1 : index, 225 : index, 225 : index]
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d3, d4, d5, d6)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+util.func public @no_split_conv_2d_nhwc_fhwc(%arg0: tensor<16x227x227x16xf32>, %arg1: tensor<64x3x3x16xf32>, %arg2: tensor<16x225x225x64xf32>) -> tensor<16x225x225x64xf32> {
+  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<16x227x227x16xf32>, tensor<64x3x3x16xf32>) outs(%arg2 : tensor<16x225x225x64xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %3 = arith.mulf %in, %in_0 : f32
+    %4 = arith.addf %out, %3 : f32
+    linalg.yield %4 : f32
+  } -> tensor<16x225x225x64xf32>
+  util.return %0 : tensor<16x225x225x64xf32>
+}
+
+// CHECK-LABEL: @no_split_conv_2d_nhwc_fhwc
+// CHECK-NOT: iree_linalg_ext.split_reduction
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1 + d5, d2 + d6, d3)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d0)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+util.func public @no_split_large_N_F_sizes(%arg0: tensor<16x98x50x1024xf32>, %arg1: tensor<16x96x48x1024xf32>, %arg2: tensor<1024x3x3x1024xf32>) -> tensor<1024x3x3x1024xf32> {
+  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<16x98x50x1024xf32>, tensor<16x96x48x1024xf32>) outs(%arg2 : tensor<1024x3x3x1024xf32>) {
+  ^bb0(%in: f32, %in_3: f32, %out: f32):
+    %12 = arith.mulf %in, %in_3 : f32
+    %13 = arith.addf %out, %12 : f32
+    linalg.yield %13 : f32
+  } -> tensor<1024x3x3x1024xf32>
+  util.return %0 : tensor<1024x3x3x1024xf32>
+}
+
+// CHECK-LABEL: @no_split_large_N_F_sizes
+// CHECK-NOT: iree_linalg_ext.split_reduction
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1 + d5, d2 + d6, d3)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d0)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+util.func public @no_split_small_H_W_sizes(%arg0: tensor<16x26x18x288xf32>, %arg1: tensor<16x24x16x288xf32>, %arg2: tensor<288x3x3x288xf32>) -> tensor<288x3x3x288xf32> {
+  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<16x26x18x288xf32>, tensor<16x24x16x288xf32>) outs(%arg2 : tensor<288x3x3x288xf32>) {
+  ^bb0(%in: f32, %in_3: f32, %out: f32):
+    %12 = arith.mulf %in, %in_3 : f32
+    %13 = arith.addf %out, %12 : f32
+    linalg.yield %13 : f32
+  } -> tensor<288x3x3x288xf32>
+  util.return %0 : tensor<288x3x3x288xf32>
+}
+
+// CHECK-LABEL: @no_split_small_H_W_sizes
+// CHECK-NOT: iree_linalg_ext.split_reduction
