Skip to content

Commit ca25934

Browse files
Add default option to only do loop fission for unit trip loops (#21069)
This option is useful since we don't get good performance when we fission multi-trip loops. Additionally, the prefetching pass that makes use of fission is not set up to support multi-trip nested loops. Note that even if one candidate is found to be multi-trip, we don't do the whole pass, as we won't be doing prefetching in that case and hence there is no point in fissioning at all. --------- Signed-off-by: Nirvedh Meshram <[email protected]>
1 parent f7f7ea8 commit ca25934

File tree

6 files changed

+122
-75
lines changed

6 files changed

+122
-75
lines changed

compiler/src/iree/compiler/Codegen/Common/FissionTransferOpsInControlFlow.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "iree/compiler/Codegen/Common/Passes.h"
88
#include "iree/compiler/Codegen/Common/Transforms.h"
99
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
10+
#include "iree/compiler/Codegen/Utils/Utils.h"
1011
#include "mlir/Analysis/SliceAnalysis.h"
1112
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1213
#include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -254,6 +255,9 @@ static FailureOr<FissionTarget> populateFissionTarget(scf::ForOp forOp) {
254255
struct FissionTransferOpsInControlFlowPass final
255256
: impl::FissionTransferOpsInControlFlowPassBase<
256257
FissionTransferOpsInControlFlowPass> {
258+
using impl::FissionTransferOpsInControlFlowPassBase<
259+
FissionTransferOpsInControlFlowPass>::
260+
FissionTransferOpsInControlFlowPassBase;
257261
void runOnOperation() override {
258262
FunctionOpInterface funcOp = getOperation();
259263
IRRewriter rewriter(funcOp.getContext());
@@ -267,6 +271,12 @@ struct FissionTransferOpsInControlFlowPass final
267271
if (failed(result)) {
268272
continue;
269273
}
274+
// When not doing multi-trip fission if we have even one multi-trip loop
275+
// we bail out from this pass and don't do fission as we won't be doing any
276+
// prefetching which is the point of doing fission.
277+
if (!FissionMultiTrip && !neverRunsSecondIteration(forOp)) {
278+
return;
279+
}
270280
fissionTargets.push_back(result.value());
271281
}
272282

compiler/src/iree/compiler/Codegen/Common/Passes.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,11 @@ def FissionTransferOpsInControlFlowPass : InterfacePass<"iree-codegen-fission-tr
355355
let dependentDialects = [
356356
"memref::MemRefDialect"
357357
];
358+
let options = [
359+
Option<"FissionMultiTrip", "fission-multi-trip",
360+
"bool", /*default=*/"false",
361+
"Allow fission in presence of loops with greater than one trip count.">
362+
];
358363
}
359364

360365
def FlattenMemRefSubspanPass : Pass<"iree-codegen-flatten-memref-subspan", "ModuleOp"> {
Lines changed: 72 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
// RUN: iree-opt --split-input-file -pass-pipeline="builtin.module(func.func(iree-codegen-fission-transfer-ops-in-control-flow),cse,canonicalize)" %s | FileCheck %s
1+
// RUN: iree-opt --split-input-file -pass-pipeline="builtin.module(func.func(iree-codegen-fission-transfer-ops-in-control-flow{fission-multi-trip}),cse,canonicalize)" %s | FileCheck %s --check-prefixes=CHECK-ALL,MULTI
2+
// RUN: iree-opt --split-input-file -pass-pipeline="builtin.module(func.func(iree-codegen-fission-transfer-ops-in-control-flow),cse)" %s | FileCheck %s --check-prefixes=CHECK-ALL,SINGLE
23

3-
// CHECK-LABEL: @fission_global_read_to_private_write
4-
// CHECK-SAME: %[[ARG0:.*]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>
5-
// CHECK-SAME: %[[ARG1:.*]]: index
6-
// CHECK-SAME: %[[ARG2:.*]]: i1
7-
// CHECK-SAME: %[[ARG3:.*]]: vector<1x1x1x8xbf16>
8-
// CHECK-SAME: %[[ARG4:.*]]: memref<1x1x1x8xbf16, #gpu.address_space<private>>
4+
// CHECK-ALL-LABEL: @fission_global_read_to_private_write
5+
// CHECK-ALL-SAME: %[[ARG0:.*]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>
6+
// CHECK-ALL-SAME: %[[ARG1:.*]]: index
7+
// CHECK-ALL-SAME: %[[ARG2:.*]]: i1
8+
// CHECK-ALL-SAME: %[[ARG3:.*]]: vector<1x1x1x8xbf16>
9+
// CHECK-ALL-SAME: %[[ARG4:.*]]: memref<1x1x1x8xbf16, #gpu.address_space<private>>
910
func.func @fission_global_read_to_private_write(%arg0: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %arg1: index, %arg2: i1, %arg3: vector<1x1x1x8xbf16>, %arg4: memref<1x1x1x8xbf16, #gpu.address_space<private>>) {
1011
%c0 = arith.constant 0 : index
1112
%c1 = arith.constant 1 : index
@@ -17,23 +18,26 @@ func.func @fission_global_read_to_private_write(%arg0: memref<1x?x?x8xbf16, #amd
1718
}
1819
return
1920
}
20-
// CHECK: %[[ALLOCA:.*]] = memref.alloca(%[[ARG1]])
21-
// CHECK: scf.for %[[ITER:.*]] = %c0 to %[[ARG1]] step %c1 {
22-
// CHECK: %[[read:.*]] = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]}
23-
// CHECK: vector.transfer_write %[[read]], %[[ALLOCA]][%[[ITER]], %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]}
24-
// CHECK: }
25-
// CHECK: scf.for %[[ITER:.*]] = %c0 to %[[ARG1]] step %c1 {
26-
// CHECK: %[[read:.*]] = vector.transfer_read %[[ALLOCA]][%[[ITER]], %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]}
27-
// CHECK: %[[select:.*]] = arith.select %[[ARG2]], %[[read]], %[[ARG3]]
28-
// CHECK: vector.transfer_write %[[select]], %arg4[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]}
29-
// CHECK: }
21+
// MULTI: %[[ALLOCA:.*]] = memref.alloca(%[[ARG1]])
22+
// MULTI: scf.for %[[ITER:.*]] = %c0 to %[[ARG1]] step %c1 {
23+
// MULTI: %[[read:.*]] = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]}
24+
// MULTI: vector.transfer_write %[[read]], %[[ALLOCA]][%[[ITER]], %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]}
25+
// MULTI: }
26+
// MULTI: scf.for %[[ITER:.*]] = %c0 to %[[ARG1]] step %c1 {
27+
// MULTI: %[[read:.*]] = vector.transfer_read %[[ALLOCA]][%[[ITER]], %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]}
28+
// MULTI: %[[select:.*]] = arith.select %[[ARG2]], %[[read]], %[[ARG3]]
29+
// MULTI: vector.transfer_write %[[select]], %arg4[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]}
30+
// MULTI: }
31+
32+
// SINGLE: scf.for
33+
// SINGLE-NOT: scf.for
3034

3135
// -----
3236

33-
// CHECK-LABEL: @fission_global_read_to_workgroup_write
34-
// CHECK-SAME: %[[ARG0:.*]]: index
35-
// CHECK-SAME: %[[ARG1:.*]]: memref<?x?xf32, #amdgpu.address_space<fat_raw_buffer>>
36-
// CHECK-SAME: %[[ARG2:.*]]: memref<1x4xf32, #gpu.address_space<workgroup>>
37+
// CHECK-ALL-LABEL: @fission_global_read_to_workgroup_write
38+
// CHECK-ALL-SAME: %[[ARG0:.*]]: index
39+
// CHECK-ALL-SAME: %[[ARG1:.*]]: memref<?x?xf32, #amdgpu.address_space<fat_raw_buffer>>
40+
// CHECK-ALL-SAME: %[[ARG2:.*]]: memref<1x4xf32, #gpu.address_space<workgroup>>
3741
func.func @fission_global_read_to_workgroup_write(%arg0: index, %arg1: memref<?x?xf32, #amdgpu.address_space<fat_raw_buffer>>, %arg2: memref<1x4xf32, #gpu.address_space<workgroup>>) {
3842
%c0 = arith.constant 0 : index
3943
%c16 = arith.constant 16 : index
@@ -45,28 +49,31 @@ func.func @fission_global_read_to_workgroup_write(%arg0: index, %arg1: memref<?x
4549
}
4650
return
4751
}
48-
// CHECK: %[[SUB:.*]] = arith.subi %c16, %[[ARG0]]
49-
// CHECK: %[[DIV:.*]] = arith.ceildivui %[[SUB]], %c128
50-
// CHECK: %[[ALLOCA:.*]] = memref.alloca(%[[DIV]])
51-
// CHECK: scf.for %[[ITER:.*]] = %[[ARG0]] to %c16 step %c128 {
52-
// CHECK: %[[READ:.*]] = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]}
53-
// CHECK: %[[SUB:.*]] = arith.subi %[[ITER]], %[[ARG0]]
54-
// CHECK: %[[DIV:.*]] = arith.divui %[[SUB]], %c128
55-
// CHECK: vector.transfer_write %[[READ]], %[[ALLOCA]][%[[DIV]], %c0, %c0] {in_bounds = [true, true]}
56-
// CHECK: }
57-
// CHECK: scf.for %[[ITER:.*]] = %[[ARG0]] to %c16 step %c128 {
58-
// CHECK: %[[SUB:.*]] = arith.subi %[[ITER]], %[[ARG0]]
59-
// CHECK: %[[DIV:.*]] = arith.divui %[[SUB]], %c128
60-
// CHECK: %[[READ:.*]] = vector.transfer_read %[[ALLOCA]][%[[DIV]], %c0, %c0], %cst {in_bounds = [true, true]}
61-
// CHECK: vector.transfer_write %[[READ]], %arg2[%c0, %c0] {in_bounds = [true, true]}
62-
// CHECK: }
52+
// MULTI: %[[SUB:.*]] = arith.subi %c16, %[[ARG0]]
53+
// MULTI: %[[DIV:.*]] = arith.ceildivui %[[SUB]], %c128
54+
// MULTI: %[[ALLOCA:.*]] = memref.alloca(%[[DIV]])
55+
// MULTI: scf.for %[[ITER:.*]] = %[[ARG0]] to %c16 step %c128 {
56+
// MULTI: %[[READ:.*]] = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]}
57+
// MULTI: %[[SUB:.*]] = arith.subi %[[ITER]], %[[ARG0]]
58+
// MULTI: %[[DIV:.*]] = arith.divui %[[SUB]], %c128
59+
// MULTI: vector.transfer_write %[[READ]], %[[ALLOCA]][%[[DIV]], %c0, %c0] {in_bounds = [true, true]}
60+
// MULTI: }
61+
// MULTI: scf.for %[[ITER:.*]] = %[[ARG0]] to %c16 step %c128 {
62+
// MULTI: %[[SUB:.*]] = arith.subi %[[ITER]], %[[ARG0]]
63+
// MULTI: %[[DIV:.*]] = arith.divui %[[SUB]], %c128
64+
// MULTI: %[[READ:.*]] = vector.transfer_read %[[ALLOCA]][%[[DIV]], %c0, %c0], %cst {in_bounds = [true, true]}
65+
// MULTI: vector.transfer_write %[[READ]], %arg2[%c0, %c0] {in_bounds = [true, true]}
66+
// MULTI: }
67+
68+
// SINGLE: scf.for
69+
// SINGLE-NOT: scf.for
6370

6471
// -----
6572

66-
// CHECK-LABEL: @no_fission_global_read_to_global_write
67-
// CHECK-SAME: %[[ARG0:.*]]: memref<1x?x?xf32, #amdgpu.address_space<fat_raw_buffer>>
68-
// CHECK-SAME: %[[ARG1:.*]]: memref<1x?x?xf32, #gpu.address_space<global>>
69-
// CHECK-SAME: %[[ARG2:.*]]: index
73+
// CHECK-ALL-LABEL: @no_fission_global_read_to_global_write
74+
// CHECK-ALL-SAME: %[[ARG0:.*]]: memref<1x?x?xf32, #amdgpu.address_space<fat_raw_buffer>>
75+
// CHECK-ALL-SAME: %[[ARG1:.*]]: memref<1x?x?xf32, #gpu.address_space<global>>
76+
// CHECK-ALL-SAME: %[[ARG2:.*]]: index
7077
func.func @no_fission_global_read_to_global_write(%arg0: memref<1x?x?xf32, #amdgpu.address_space<fat_raw_buffer>>, %arg1: memref<1x?x?xf32, #gpu.address_space<global>>, %arg2: index) {
7178
%c0 = arith.constant 0 : index
7279
%c1 = arith.constant 1 : index
@@ -77,8 +84,28 @@ func.func @no_fission_global_read_to_global_write(%arg0: memref<1x?x?xf32, #amdg
7784
}
7885
return
7986
}
80-
// CHECK: scf.for %[[ITER:.*]] = %c0 to %[[ARG2]] step %c1 {
81-
// CHECK: %[[READ:.*]] = vector.transfer_read
82-
// CHECK: vector.transfer_write %[[READ]], %arg1[%[[ITER]], %c0, %c0] {in_bounds = [true, true, true]}
83-
// CHECK: }
84-
// CHECK-NOT: scf.for
87+
// MULTI: scf.for %[[ITER:.*]] = %c0 to %[[ARG2]] step %c1 {
88+
// MULTI: %[[READ:.*]] = vector.transfer_read
89+
// MULTI: vector.transfer_write %[[READ]], %arg1[%[[ITER]], %c0, %c0] {in_bounds = [true, true, true]}
90+
// MULTI: }
91+
// MULTI-NOT: scf.for
92+
93+
// SINGLE: scf.for
94+
// SINGLE-NOT: scf.for
95+
96+
// -----
97+
98+
// CHECK-ALL-LABEL: @fission_unit_trip
99+
func.func @fission_unit_trip(%arg0: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %arg1: index, %arg2: i1, %arg3: vector<1x1x1x8xbf16>, %arg4: memref<1x1x1x8xbf16, #gpu.address_space<private>>) {
100+
%c0 = arith.constant 0 : index
101+
%c1 = arith.constant 1 : index
102+
%cst = arith.constant 0.000000e+00 : bf16
103+
%ub = affine.min affine_map<(d0) -> (1, d0)>(%arg1)
104+
scf.for %arg5 = %c0 to %ub step %c1 {
105+
%read = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, vector<1x1x1x8xbf16>
106+
%select = arith.select %arg2, %read, %arg3 : vector<1x1x1x8xbf16>
107+
vector.transfer_write %select, %arg4[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x8xbf16>, memref<1x1x1x8xbf16, #gpu.address_space<private>>
108+
}
109+
return
110+
}
111+
// CHECK-ALL-COUNT-2: scf.for

compiler/src/iree/compiler/Codegen/Transforms/RemoveSingleIterationLoop.cpp

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
//===----------------------------------------------------------------------===//
1313

1414
#include "iree/compiler/Codegen/Transforms/Transforms.h"
15+
#include "iree/compiler/Codegen/Utils/Utils.h"
1516
#include "llvm/Support/Debug.h"
1617
#include "mlir/Dialect/Affine/IR/AffineOps.h"
1718
#include "mlir/Dialect/Affine/Utils.h"
@@ -60,36 +61,6 @@ static void replaceForWithIf(PatternRewriter &rewriter, scf::ForOp op,
6061
rewriter.replaceOp(op, ifOp);
6162
}
6263

63-
/// Return true if we can prove that the we always run at least the first
64-
/// iteration of the ForOp.
65-
static bool alwaysRunsFirstIteration(scf::ForOp op) {
66-
// Can't perform the analysis if the loops's bounds aren't index-typed.
67-
if (!op.getInductionVar().getType().isIndex())
68-
return false;
69-
FailureOr<bool> isLb = ValueBoundsConstraintSet::compare(
70-
getAsOpFoldResult(op.getLowerBound()), ValueBoundsConstraintSet::LT,
71-
getAsOpFoldResult(op.getUpperBound()));
72-
return isLb.value_or(false);
73-
}
74-
75-
/// Return true if we can prove that the we never run more than one iteration of
76-
/// the ForOp.
77-
static bool neverRunsSecondIteration(scf::ForOp op) {
78-
// Can't perform the analysis if the loops's bounds aren't index-typed.
79-
if (!op.getInductionVar().getType().isIndex())
80-
return false;
81-
// If the upper bound (ub) is less than or equal to the loop step, then
82-
// lower bound + step must be greater than the upper bound, assuming the
83-
// lower bound is non-negative.
84-
FailureOr<bool> isUbUnderStep = ValueBoundsConstraintSet::compare(
85-
getAsOpFoldResult(op.getUpperBound()), ValueBoundsConstraintSet::LE,
86-
getAsOpFoldResult(op.getStep()));
87-
FailureOr<bool> isLbNonNegative = ValueBoundsConstraintSet::compare(
88-
getAsOpFoldResult(op.getLowerBound()), ValueBoundsConstraintSet::GE,
89-
getAsIndexOpFoldResult(op.getContext(), 0));
90-
return isUbUnderStep.value_or(false) && isLbNonNegative.value_or(false);
91-
}
92-
9364
namespace {
9465
/// Rewriting pattern that replaces single-iteration loops with their bodies.
9566
struct SimplifyTrivialLoops : public OpRewritePattern<scf::ForOp> {

compiler/src/iree/compiler/Codegen/Utils/Utils.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1903,4 +1903,30 @@ std::optional<int64_t> getConstantIndex(Value value) {
19031903
return val.getSExtValue();
19041904
}
19051905

1906+
/// Returns true when we can prove the loop executes its first iteration, i.e.
/// the lower bound is provably strictly less than the upper bound.
bool alwaysRunsFirstIteration(scf::ForOp op) {
  // The value-bounds analysis only handles index-typed loop bounds.
  if (!op.getInductionVar().getType().isIndex())
    return false;
  FailureOr<bool> lbLessThanUb = ValueBoundsConstraintSet::compare(
      getAsOpFoldResult(op.getLowerBound()), ValueBoundsConstraintSet::LT,
      getAsOpFoldResult(op.getUpperBound()));
  // Treat an inconclusive analysis as "cannot prove it".
  return lbLessThanUb.value_or(false);
}
1915+
1916+
/// Returns true when we can prove the loop runs at most one iteration.
bool neverRunsSecondIteration(scf::ForOp op) {
  // The value-bounds analysis only handles index-typed loop bounds.
  if (!op.getInductionVar().getType().isIndex())
    return false;
  // If the upper bound is <= the step and the lower bound is non-negative,
  // then lb + step must exceed the upper bound, so the induction variable
  // never reaches a second iteration.
  FailureOr<bool> ubFitsInOneStep = ValueBoundsConstraintSet::compare(
      getAsOpFoldResult(op.getUpperBound()), ValueBoundsConstraintSet::LE,
      getAsOpFoldResult(op.getStep()));
  FailureOr<bool> lbIsNonNegative = ValueBoundsConstraintSet::compare(
      getAsOpFoldResult(op.getLowerBound()), ValueBoundsConstraintSet::GE,
      getAsIndexOpFoldResult(op.getContext(), 0));
  // Both facts must be provable; inconclusive results count as false.
  return ubFitsInOneStep.value_or(false) && lbIsNonNegative.value_or(false);
}
1931+
19061932
} // namespace mlir::iree_compiler

compiler/src/iree/compiler/Codegen/Utils/Utils.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,14 @@ inferSizesFromIR(linalg::LinalgOp linalgOp, std::optional<OpResult> opResult);
322322
/// Returns the underlying index if the given value is a constant index.
323323
std::optional<int64_t> getConstantIndex(Value value);
324324

325+
/// Return true if we can prove that we always run at least the first
326+
/// iteration of the ForOp.
327+
bool alwaysRunsFirstIteration(scf::ForOp op);
328+
329+
/// Return true if we can prove that we never run more than one iteration of
330+
/// the ForOp.
331+
bool neverRunsSecondIteration(scf::ForOp op);
332+
325333
} // namespace mlir::iree_compiler
326334

327335
#endif // IREE_COMPILER_CODEGEN_UTILS_UTILS_H_

0 commit comments

Comments
 (0)