
Commit 26e6e97

Add support for dynamic unit trip scf.for to scf.if (#20880)

This PR adds support for dynamic unit trip (0 or 1 trip) scf.for using scf.if.

Signed-off-by: Nirvedh Meshram <[email protected]>

1 parent 48b081d
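
For orientation, a hedged sketch of the rewrite this commit enables (illustrative IR under assumed bounds; the op "some.op" and the value names are hypothetical, not taken from the test suite). A loop whose trip count is provably at most one, but whose first iteration is not guaranteed, becomes an scf.if guarded by a comparison of the bounds:

    // Before: range analysis proves 0 <= %ub <= 3, so with step %c3 the loop
    // runs zero times or once; it cannot simply be erased.
    scf.for %i = %c0 to %ub step %c3 {
      "some.op"(%i) : (index) -> ()
    }

    // After: the body is inlined into an scf.if, with the lower bound
    // substituted for the induction variable.
    %cond = arith.cmpi sgt, %ub, %c0 : index
    scf.if %cond {
      "some.op"(%c0) : (index) -> ()
    }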

File tree

3 files changed: +134 −16 lines

compiler/src/iree/compiler/Codegen/Common/test/remove_single_iteration_loop.mlir

Lines changed: 98 additions & 5 deletions
@@ -16,11 +16,14 @@ func.func @thread_tile_loop() {
       gpu.barrier
     }
   }
-  // The inner loop doesn't always execute once so it cannot be removed.
-  // CHECK: scf.for %{{.*}} = %{{.*}} to %[[C250]] step %[[C250]]
-  // CHECK: gpu.barrier
   scf.for %arg3 = %tidy to %c2 step %c2 {
+    // CHECK-NOT: scf.for
     %0 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%tidx]
+    // CHECK: %[[LB:.+]] = affine.apply
+    // The inner loop doesn't always execute once so it needs an scf.if
+    // CHECK: %[[COND:.+]] = arith.cmpi slt, %[[LB]], %[[C250]] : index
+    // CHECK: scf.if %[[COND]] {
+    // CHECK: gpu.barrier
     scf.for %arg4 = %0 to %c250 step %c250 {
       gpu.barrier
     }
@@ -161,6 +164,7 @@ func.func @delinearize_linearize() {
   %c64 = arith.constant 64 : index
   %tidx = gpu.thread_id x upper_bound 128
   %ids:2 = affine.delinearize_index %tidx into (4, 32) : index, index
+  // CHECK: %[[IDS:.+]]:2 = affine.delinearize_index
   // CHECK-NOT: scf.for
   // CHECK: gpu.barrier
   scf.for %arg3 = %ids#0 to %c4 step %c4 {
@@ -169,8 +173,9 @@ func.func @delinearize_linearize() {
       gpu.barrier
     }
   }
-  // The loop loop doesn't always execute once so it cannot be removed.
-  // CHECK: scf.for %{{.*}} = %{{.*}} to %[[C3]] step %{{.*}}
+  // The loop doesn't always execute once so it needs an scf.if
+  // CHECK: %[[COND:.+]] = arith.cmpi slt, %[[IDS:.+]]#0, %[[C3]] : index
+  // CHECK: scf.if %[[COND]] {
   // CHECK: gpu.barrier
   scf.for %arg3 = %ids#0 to %c3 step %c4 {
     gpu.barrier
@@ -220,3 +225,91 @@ func.func @argument_with_assume(%arg_index : index) {
   }
   return
 }
+
+// -----
+
+func.func @dynamic_ub_unittrip(%arg_index : index, %arg_value : memref<8xf16>) {
+  %c1 = arith.constant 0 : index
+  %c3 = arith.constant 3 : index
+  %0 = util.assume.int %arg_index<umin = 0, umax = 3> : index
+  scf.for %arg1 = %c1 to %0 step %c3 {
+    %alloc = memref.alloc() : memref<4xf16>
+    %subview = memref.subview %arg_value[%arg1][4][1] : memref<8xf16> to memref<4xf16, strided<[1], offset: ?>>
+    memref.copy %alloc, %subview : memref<4xf16> to memref<4xf16, strided<[1], offset: ?>>
+  }
+  return
+}
+// CHECK-LABEL: func.func @dynamic_ub_unittrip
+// CHECK-SAME: (%[[ARGINDEX:.+]]: index, %[[ARGVALUE:.+]]: memref<8xf16>)
+// CHECK: %[[C0:.+]] = arith.constant 0 : index
+// CHECK: %[[UB:.+]] = util.assume.int %[[ARGINDEX]]
+// CHECK: %[[COND:.+]] = arith.cmpi sgt, %[[UB]], %[[C0]] : index
+// CHECK: scf.if %[[COND]] {
+// CHECK: memref.alloc()
+// CHECK: memref.subview %[[ARGVALUE]][%[[C0]]] [4] [1]
+// CHECK: memref.copy
+
+// -----
+
+func.func @dynamic_lb_unittrip(%arg_index : index, %arg_value : memref<8xf16>) {
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %0 = util.assume.int %arg_index<umin = 0, umax = 50> : index
+  scf.for %arg1 = %0 to %c3 step %c3 {
+    %alloc = memref.alloc() : memref<4xf16>
+    %subview = memref.subview %arg_value[%arg1][4][1] : memref<8xf16> to memref<4xf16, strided<[1], offset: ?>>
+    memref.copy %alloc, %subview : memref<4xf16> to memref<4xf16, strided<[1], offset: ?>>
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @dynamic_lb_unittrip
+// CHECK-SAME: (%[[ARGINDEX:.+]]: index, %[[ARGVALUE:.+]]: memref<8xf16>)
+// CHECK: %[[C3:.+]] = arith.constant 3 : index
+// CHECK: %[[LB:.+]] = util.assume.int %[[ARGINDEX]]
+// CHECK: %[[COND:.+]] = arith.cmpi slt, %[[LB]], %[[C3]] : index
+// CHECK: scf.if %[[COND]] {
+// CHECK: memref.alloc()
+// CHECK: memref.subview %[[ARGVALUE]][%[[LB]]] [4] [1]
+// CHECK: memref.copy
+
+// -----
+
+func.func @dynamic_nonunittrip(%arg_index : index, %arg_value : memref<8xf16>) {
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %0 = util.assume.int %arg_index<umin = 0, umax = 5> : index
+  scf.for %arg1 = %c1 to %0 step %c3 {
+    gpu.barrier
+  }
+  return
+}
+// CHECK-LABEL: func.func @dynamic_nonunittrip
+// CHECK: scf.for
+
+// -----
+
+func.func @dynamic_unittrip_with_destination(%arg_index : index, %arg_value : tensor<8xf16>) -> tensor<4xf16> {
+  %c0 = arith.constant 0 : index
+  %c3 = arith.constant 3 : index
+  %0 = util.assume.int %arg_index<umin = 0, umax = 3> : index
+  %empty = tensor.empty() : tensor<4xf16>
+  %1 = scf.for %arg1 = %c0 to %0 step %c3 iter_args(%arg2 = %empty) -> (tensor<4xf16>) {
+    %extract = tensor.extract_slice %arg_value[%arg1][4][1] : tensor<8xf16> to tensor<4xf16>
+    %2 = arith.negf %extract : tensor<4xf16>
+    scf.yield %2 : tensor<4xf16>
+  }
+  return %1 : tensor<4xf16>
+}
+
+// CHECK-LABEL: func.func @dynamic_unittrip_with_destination
+// CHECK-SAME: (%[[ARGINDEX:.+]]: index, %[[ARGTENSOR:.+]]: tensor<8xf16>)
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<4xf16>
+// CHECK: %[[RESULT:.+]] = scf.if
+// CHECK: %[[SLICE:.+]] = tensor.extract_slice
+// CHECK: %[[NEG:.+]] = arith.negf %[[SLICE]] : tensor<4xf16>
+// CHECK: scf.yield %[[NEG]] : tensor<4xf16>
+// CHECK: } else {
+// CHECK: scf.yield %[[EMPTY]] : tensor<4xf16>
+// CHECK: }
+// CHECK: return %[[RESULT]] : tensor<4xf16>
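
A note on why @dynamic_nonunittrip keeps its scf.for: the trip count of an scf.for is ceildiv(ub - lb, step), clamped at zero, and the pattern fires only when that count is provably at most one. A worked instance under the bounds assumed in the test:

    lb = 1, step = 3, ub in [0, 5]
    ub = 5  =>  ceildiv(5 - 1, 3) = 2 trips  (a second iteration is possible)
    ub = 3  =>  ceildiv(3 - 1, 3) = 1 trip

Since some feasible upper bounds give two iterations, the analysis cannot prove a unit trip and the loop is left intact.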

compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp

Lines changed: 1 addition & 0 deletions
@@ -403,6 +403,7 @@ void addSPIRVCooperativeMatrixVectorizePassPipeline(

   // Tile and distribute to GPU subgroups.
   funcPassManager.addPass(createSPIRVTileToCooperativeOpsPass());
+  funcPassManager.addPass(createPropagateDispatchSizeBoundsPass());
   funcPassManager.addPass(createRemoveSingleIterationLoopPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
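
Pass ordering is the point of this change: the single-iteration-loop pass reasons about loop bounds using value-range information, so the bounds-propagation pass has to run after tiling and before it. A hedged illustration of the kind of IR this enables (the upper_bound form appears in the test file above; the exact IR produced in this pipeline may differ):

    // After bounds propagation, ID ops carry range facts such as:
    %tidx = gpu.thread_id x upper_bound 128
    // ...which lets the next pass prove that a distribution loop like
    //   scf.for %i = %tidx to %c128 step %c128 { ... }
    // executes at most once, so it can be inlined or guarded by an scf.if.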

compiler/src/iree/compiler/Codegen/Transforms/RemoveSingleIterationLoop.cpp

Lines changed: 35 additions & 11 deletions
@@ -27,17 +27,39 @@ namespace mlir::iree_compiler {

 /// Replaces the given op with the contents of the given single-block region,
 /// using the operands of the block terminator to replace operation results.
-static void replaceOpWithRegion(PatternRewriter &rewriter, Operation *op,
-                                Region &region, ValueRange blockArgs = {}) {
-  assert(llvm::hasSingleElement(region) && "expected single-region block");
-  Block *block = &region.front();
+static void replaceOpWithRegion(PatternRewriter &rewriter, scf::ForOp op,
+                                ValueRange blockArgs = {}) {
+  Block *block = op.getBody();
   Operation *terminator = block->getTerminator();
   ValueRange results = terminator->getOperands();
   rewriter.inlineBlockBefore(block, op, blockArgs);
   rewriter.replaceOp(op, results);
   rewriter.eraseOp(terminator);
 }

+/// Same as the `replaceOpWithRegion` function, but inlines the loop body into
+/// an scf.if region.
+static void replaceForWithIf(PatternRewriter &rewriter, scf::ForOp op,
+                             ValueRange blockArgs = {}) {
+  Block *block = op.getBody();
+  ValueRange initArgs = op.getInitArgs();
+  Value count =
+      rewriter.create<arith::CmpIOp>(op->getLoc(), arith::CmpIPredicate::sgt,
+                                     op.getUpperBound(), op.getLowerBound());
+  auto ifOp =
+      rewriter.create<scf::IfOp>(op->getLoc(), op.getResultTypes(), count,
+                                 /*withElseRegion=*/initArgs.size() != 0);
+  Operation *terminator = block->getTerminator();
+  rewriter.inlineBlockBefore(block, &ifOp.getThenRegion().front(),
+                             ifOp.getThenRegion().front().begin(), blockArgs);
+  if (initArgs.size() == 0) {
+    rewriter.eraseOp(terminator);
+  } else {
+    rewriter.setInsertionPointToStart(&ifOp.getElseRegion().front());
+    rewriter.create<scf::YieldOp>(ifOp.getLoc(), initArgs);
+  }
+  rewriter.replaceOp(op, ifOp);
+}
+
 /// Return true if we can prove that the we always run at least the first
 /// iteration of the ForOp.
 static bool alwaysRunsFirstIteration(scf::ForOp op) {
@@ -75,20 +97,22 @@ struct SimplifyTrivialLoops : public OpRewritePattern<scf::ForOp> {

   LogicalResult matchAndRewrite(scf::ForOp op,
                                 PatternRewriter &rewriter) const override {
-    // TODO: Handle the case where we know that the loop doesn't run more than
-    // once but the loop may not run at least once by replace the `loop` with an
-    // `if`.
-    if (!(alwaysRunsFirstIteration(op) && neverRunsSecondIteration(op))) {
+    if (!(neverRunsSecondIteration(op))) {
       return failure();
     }

-    // The first iteration is always run and the second iteration is never run
-    // so the loop always have 1 iteration. Inline its body and remove the loop.
+    // The second iteration is never run, so the loop has at most one
+    // iteration. Inline its body and remove the loop.
     SmallVector<Value> blockArgs;
     blockArgs.reserve(op.getInitArgs().size() + 1);
     blockArgs.push_back(op.getLowerBound());
     llvm::append_range(blockArgs, op.getInitArgs());
-    replaceOpWithRegion(rewriter, op, op.getRegion(), blockArgs);
+    if (alwaysRunsFirstIteration(op)) {
+      replaceOpWithRegion(rewriter, op, blockArgs);
+    } else {
+      replaceForWithIf(rewriter, op, blockArgs);
+    }
     return success();
   }
 };
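
To see why replaceForWithIf materializes an else region only when the loop has init args: an scf.if that produces results must yield from both branches, and when the body never runs, the loop's results are exactly its init values. A hedged sketch of the loop-carried case (illustrative IR with hypothetical values %x and %init, not taken from the tests):

    // Before: a 0-or-1-trip loop carrying %init.
    %r = scf.for %i = %lb to %ub step %s iter_args(%acc = %init) -> (f32) {
      %v = arith.addf %acc, %x : f32
      scf.yield %v : f32
    }

    // After: the inlined body's scf.yield becomes the then-branch yield, the
    // induction variable is replaced by %lb, %acc by %init, and the generated
    // else branch re-yields the untouched init args.
    %cond = arith.cmpi sgt, %ub, %lb : index
    %r = scf.if %cond -> (f32) {
      %v = arith.addf %init, %x : f32
      scf.yield %v : f32
    } else {
      scf.yield %init : f32
    }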
