Skip to content

Commit e122dcc

Browse files
authored
[Global Opt] Prevent expanding reduction dims (iree-org#20290)
Prevents fusing reshapes with reduction ops. This fixes a llama fp8 performance regression caused by interleaved parallel/reduction dimensions introduced after the LLVM integrate (iree-org/llvm-project@813bbe0). Adding `memref::populateResolveRankedShapedTypeResultDimsPatterns` is unrelated to that fix, but some dynamic dimensions were observed that were not getting simplified during this pass. --------- Signed-off-by: Ian Wood <[email protected]>
1 parent 5802af8 commit e122dcc

File tree

2 files changed

+22
-1
lines changed

2 files changed

+22
-1
lines changed

compiler/src/iree/compiler/GlobalOptimization/PropagateLinalgTranspose.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "mlir/Dialect/Linalg/IR/Linalg.h"
2222
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
2323
#include "mlir/Dialect/Linalg/Utils/Utils.h"
24+
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
2425
#include "mlir/Dialect/Tensor/IR/Tensor.h"
2526
#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
2627
#include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -1001,6 +1002,7 @@ populateCommonCanonicalizationPatterns(MLIRContext *context,
10011002
tensor::EmptyOp::getCanonicalizationPatterns(patterns, context);
10021003
tensor::ExpandShapeOp::getCanonicalizationPatterns(patterns, context);
10031004
tensor::CollapseShapeOp::getCanonicalizationPatterns(patterns, context);
1005+
memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
10041006
tensor::populateFoldTensorEmptyPatterns(patterns,
10051007
/*foldSingleUseOnly=*/false);
10061008
}
@@ -1140,7 +1142,7 @@ void PropagateLinalgTransposePass::runOnOperation() {
11401142
return false;
11411143
}
11421144
auto consumerLinalgOp = dyn_cast<linalg::LinalgOp>(consumer);
1143-
if (!consumerLinalgOp) {
1145+
if (!consumerLinalgOp || consumerLinalgOp.getNumReductionLoops()) {
11441146
return false;
11451147
}
11461148
// Only reshape generic ops.

compiler/src/iree/compiler/GlobalOptimization/test/propagate_linalg_transpose.mlir

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,3 +733,22 @@ util.func public @bubble_transpose_v_from_attention(%q: tensor<2x10x4096x64xf16>
733733
// CHECK-SAME: ins(%[[ARG0]], %[[ARG1]], %[[TRANS_V]], %[[ARG5]] : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16)
734734
// CHECK-SAME: outs(%[[EMPTY]] : tensor<2x10x4096x64xf16>)
735735
// CHECK: util.return %[[ATTN]] : tensor<2x10x4096x64xf16>
736+
737+
// -----
738+
739+
util.func public @dont_reshape_reduction(%arg0: tensor<16x4x4xf32>, %arg1: tensor<16x16xf32>) -> tensor<16x16xf32> {
740+
%empty1 = tensor.empty(): tensor<16x4x4xf32>
741+
%0 = linalg.transpose ins(%arg0 : tensor<16x4x4xf32>)
742+
outs(%empty1 : tensor<16x4x4xf32>) permutation = [0, 2, 1]
743+
%collapse = tensor.collapse_shape %0 [[0], [1, 2]] : tensor<16x4x4xf32> into tensor<16x16xf32>
744+
%empty2 = tensor.empty(): tensor<16x16xf32>
745+
%1 = linalg.matmul ins(%collapse, %arg1: tensor<16x16xf32>, tensor<16x16xf32>)
746+
outs(%empty2 : tensor<16x16xf32>) -> tensor<16x16xf32>
747+
748+
util.return %1 : tensor<16x16xf32>
749+
}
750+
// APROP-LABEL: util.func public @dont_reshape_reduction
751+
// APROP: %[[V0:.+]] = linalg.transpose
752+
// APROP: %[[V1:.+]] = tensor.collapse_shape %[[V0]]
753+
// APROP: %[[V2:.+]] = linalg.matmul ins(%[[V1]]
754+
// APROP: util.return %[[V2]]

0 commit comments

Comments (0)