Commit 02e5678

[CodeGen] Improve scf.for bufferization and make hoisting allocation work
The revision enables `allowReturnAllocsFromLoops` in bufferization, which matches the upstream behavior. Otherwise, it can trigger an error like:

```
error: Yield operand #1 is not equivalent to the corresponding iter bbArg
```

In this context, a `memref.alloca` can be created inside the loop and its dynamic size can be queried from the iter_arg. The ValueBoundsConstraintSet check does not support this analysis, because the runtime dimension values can still differ. E.g.:

```mlir
%result = scf.for ... iter_args(%iter = %init) -> (memref<?xf32>) {
  %new_buf = memref.alloca(%some_other_size) : memref<?xf32>
  scf.yield %new_buf : memref<?xf32>  // same type, different runtime size
}
```

It is weird, but it is allowed. Thus, we need to handle such cases in `hoistOneStaticallyBoundAllocation`. The revision verifies that the dimension is preserved via one of:

1. The yield operand (after walking through cast/subview) is the iter_arg.
2. The yield operand traces to an alloca whose shape matches the iter_arg and
   whose dynamic size at `dimIndex` is `memref.dim` of the iter_arg.
3. The yield operand is a scf.for result whose init arg is the iter_arg and the
   inner loop also preserves the dimension (recursive).

Signed-off-by: hanhanW <hanhan0912@gmail.com>
1 parent 5ee0652 commit 02e5678
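To make the effect concrete, the following is a condensed before/after sketch of the hoisting this enables, hand-simplified from the new `@hoist_alloca_yield_iter_arg` test added below; `%init`, `%n`, `%c0`, `%c1`, and `%cst` are assumed to be defined as in that test, with `%init` a cast of a subview of a static `memref<1x4xf32>`:

```mlir
// Before hoisting: the alloca inside the loop is sized by a dimension of the
// iter_arg, which plain value-bounds analysis cannot bound across iterations.
%result = scf.for %i = %c0 to %n step %c1
    iter_args(%iter = %init) -> (memref<1x?xf32, strided<[?, ?], offset: ?>>) {
  %dim = memref.dim %iter, %c1 : memref<1x?xf32, strided<[?, ?], offset: ?>>
  %alloca = memref.alloca(%dim) : memref<1x?xf32>
  linalg.fill ins(%cst : f32) outs(%alloca : memref<1x?xf32>)
  scf.yield %iter : memref<1x?xf32, strided<[?, ?], offset: ?>>
}

// After hoisting: once the yield is proven to preserve the dimension, the
// bound is computed from the init value (a memref<1x4xf32> here), a static
// alloca is hoisted to the function entry, and the loop body takes a subview.
%hoisted = memref.alloca() : memref<1x4xf32>
%result = scf.for %i = %c0 to %n step %c1
    iter_args(%iter = %init) -> (memref<1x?xf32, strided<[?, ?], offset: ?>>) {
  %dim = memref.dim %iter, %c1 : memref<1x?xf32, strided<[?, ?], offset: ?>>
  %sv = memref.subview %hoisted[0, 0] [1, %dim] [1, 1]
      : memref<1x4xf32> to memref<1x?xf32, strided<[4, 1]>>
  linalg.fill ins(%cst : f32) outs(%sv : memref<1x?xf32, strided<[4, 1]>>)
  scf.yield %iter : memref<1x?xf32, strided<[?, ?], offset: ?>>
}
```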

File tree

4 files changed: +328 additions, -15 deletions


compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp

Lines changed: 7 additions & 0 deletions
```diff
@@ -151,6 +151,13 @@ static IREEOneShotBufferizationOptions getBufferizationOptions() {
   // as is and insert bufferization.to_buffer to convert the tensor to memref.
   options.opFilter.denyOperation<arith::ConstantOp>();
 
+  // Allow returning allocs from loops. This is needed for patterns like online
+  // attention where scf.for yield operands cannot be buffer-equivalent to their
+  // corresponding iter bbArgs (e.g., the new max value is computed from both
+  // the old max and new data). This matches MLIR upstream's
+  // EmptyTensorElimination pass behavior.
+  options.allowReturnAllocsFromLoops = true;
+
   // This type converter converts tensor types to memref types when no exact
   // memref type can be inferred from the context.
   options.unknownTypeConverterFn = [](TensorType tensorType,
```
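For context, the shape of IR this option affects looks roughly like the following hand-written sketch (not copied from this commit); the `"compute"`/`"consume"` ops are placeholders standing in for the `linalg.generic` ops used in the regression test added to `iree_comprehensive_bufferize.mlir` below, and the constants, `%fill`, and `%init` are assumed to be defined as in that test:

```mlir
// Tensor form: the yielded %new is not buffer-equivalent to the iter bbArg
// %arg, because %arg is still read after %new has been produced.
%0 = scf.for %iv = %c0 to %c4 step %c1 iter_args(%arg = %fill) -> (tensor<4xf32>) {
  %new = "compute"(%arg) : (tensor<4xf32>) -> tensor<4xf32>
  %use = "consume"(%new, %arg) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
  scf.yield %new : tensor<4xf32>
}

// Buffer form permitted by allowReturnAllocsFromLoops: instead of failing with
// the "not equivalent to the corresponding iter bbArg" error, bufferization
// yields a fresh allocation from the loop body.
%init = memref.alloc() : memref<4xf32>
%res = scf.for %iv = %c0 to %c4 step %c1 iter_args(%arg = %init) -> (memref<4xf32>) {
  %new = memref.alloc() : memref<4xf32>
  "compute_into"(%arg, %new) : (memref<4xf32>, memref<4xf32>) -> ()
  "consume_into"(%new, %arg) : (memref<4xf32>, memref<4xf32>) -> ()
  scf.yield %new : memref<4xf32>
}
```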

compiler/src/iree/compiler/Codegen/Common/test/hoist_statically_bound_allocations.mlir

Lines changed: 137 additions & 0 deletions
```diff
@@ -249,3 +249,140 @@ func.func @nested_op_scalable_alloc_linalg_use(%arg0 : index) {
 // CHECK-UNBOUNDED-VSCALE-LABEL: func @nested_op_scalable_alloc_linalg_use(
 // CHECK-UNBOUNDED-VSCALE: scf.for
 // CHECK-UNBOUNDED-VSCALE: memref.alloc
+
+// -----
+
+// The yield is the iter_arg itself — dimension trivially preserved. The
+// alloca's size comes from memref.dim of the iter_arg, and
+// computeAllocationBound traces through the loop to the init value.
+func.func @hoist_alloca_yield_iter_arg(%arg0 : index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %static = memref.alloca() : memref<1x4xf32>
+  %sv = memref.subview %static[0, 0][1, %arg0][1, 1]
+      : memref<1x4xf32> to memref<1x?xf32, strided<[4, 1]>>
+  %init = memref.cast %sv
+      : memref<1x?xf32, strided<[4, 1]>> to memref<1x?xf32, strided<[?, ?], offset: ?>>
+  %result = scf.for %i = %c0 to %arg0 step %c1
+      iter_args(%iter = %init) -> (memref<1x?xf32, strided<[?, ?], offset: ?>>) {
+    %dim = memref.dim %iter, %c1 : memref<1x?xf32, strided<[?, ?], offset: ?>>
+    %alloca = memref.alloca(%dim) : memref<1x?xf32>
+    linalg.fill ins(%cst : f32) outs(%alloca : memref<1x?xf32>)
+    scf.yield %iter : memref<1x?xf32, strided<[?, ?], offset: ?>>
+  }
+  return
+}
+// CHECK-LABEL: func @hoist_alloca_yield_iter_arg(
+// CHECK: %[[HOISTED:.+]] = memref.alloca() : memref<1x4xf32>
+// CHECK: scf.for
+// CHECK-NOT: memref.alloca(
+// CHECK: %[[DIM:.+]] = memref.dim
+// CHECK: %[[SV:.+]] = memref.subview %[[HOISTED]][0, 0] [1, %[[DIM]]] [1, 1]
+// CHECK: linalg.fill
+
+// -----
+
+// The yield traces through cast and subview to an alloca whose dynamic size at
+// dimIndex is memref.dim of the iter_arg (self-referential). This exercises the
+// cast/subview walk in the function.
+func.func @hoist_alloca_yield_self_ref_subview(%arg0 : index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %static = memref.alloca() : memref<1x4xf32>
+  %sv = memref.subview %static[0, 0][1, %arg0][1, 1]
+      : memref<1x4xf32> to memref<1x?xf32, strided<[4, 1]>>
+  %init = memref.cast %sv
+      : memref<1x?xf32, strided<[4, 1]>> to memref<1x?xf32, strided<[?, ?], offset: ?>>
+  %result = scf.for %i = %c0 to %arg0 step %c1
+      iter_args(%iter = %init) -> (memref<1x?xf32, strided<[?, ?], offset: ?>>) {
+    %dim = memref.dim %iter, %c1 : memref<1x?xf32, strided<[?, ?], offset: ?>>
+    %alloca = memref.alloca(%dim) : memref<1x?xf32>
+    linalg.fill ins(%cst : f32) outs(%alloca : memref<1x?xf32>)
+    %val = memref.load %alloca[%c0, %c0] : memref<1x?xf32>
+    // Yield traces: cast → subview → alloca (exercises the walk loop).
+    %sub = memref.subview %alloca[0, 0][1, %dim][1, 1]
+        : memref<1x?xf32> to memref<1x?xf32, strided<[?, 1]>>
+    %cast = memref.cast %sub
+        : memref<1x?xf32, strided<[?, 1]>> to memref<1x?xf32, strided<[?, ?], offset: ?>>
+    scf.yield %cast : memref<1x?xf32, strided<[?, ?], offset: ?>>
+  }
+  return
+}
+// CHECK-LABEL: func @hoist_alloca_yield_self_ref_subview(
+// CHECK: %[[HOISTED:.+]] = memref.alloca() : memref<1x4xf32>
+// CHECK: scf.for
+// CHECK-NOT: memref.alloca(
+// CHECK: %[[DIM:.+]] = memref.dim
+// CHECK: %[[SV:.+]] = memref.subview %[[HOISTED]][0, 0] [1, %[[DIM]]] [1, 1]
+// CHECK: linalg.fill
+// CHECK: memref.load
+
+// -----
+
+// The yield is an inner scf.for result. The inner loop preserves the dimension
+// via the case that yield is iter_arg, and the recursive check verifies the
+// inner loop.
+func.func @hoist_alloca_yield_nested_loop(%arg0 : index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %static = memref.alloca() : memref<1x4xf32>
+  %sv = memref.subview %static[0, 0][1, %arg0][1, 1]
+      : memref<1x4xf32> to memref<1x?xf32, strided<[4, 1]>>
+  %init = memref.cast %sv
+      : memref<1x?xf32, strided<[4, 1]>> to memref<1x?xf32, strided<[?, ?], offset: ?>>
+  %result = scf.for %i = %c0 to %arg0 step %c1
+      iter_args(%outer_iter = %init) -> (memref<1x?xf32, strided<[?, ?], offset: ?>>) {
+    %inner = scf.for %j = %c0 to %arg0 step %c1
+        iter_args(%inner_iter = %outer_iter) -> (memref<1x?xf32, strided<[?, ?], offset: ?>>) {
+      %dim = memref.dim %inner_iter, %c1 : memref<1x?xf32, strided<[?, ?], offset: ?>>
+      %alloca = memref.alloca(%dim) : memref<1x?xf32>
+      linalg.fill ins(%cst : f32) outs(%alloca : memref<1x?xf32>)
+      scf.yield %inner_iter : memref<1x?xf32, strided<[?, ?], offset: ?>>
+    }
+    scf.yield %inner : memref<1x?xf32, strided<[?, ?], offset: ?>>
+  }
+  return
+}
+// CHECK-LABEL: func @hoist_alloca_yield_nested_loop(
+// CHECK: %[[HOISTED:.+]] = memref.alloca() : memref<1x4xf32>
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK-NOT: memref.alloca(
+// CHECK: %[[DIM:.+]] = memref.dim
+// CHECK: %[[SV:.+]] = memref.subview %[[HOISTED]][0, 0] [1, %[[DIM]]] [1, 1]
+// CHECK: linalg.fill
+
+// -----
+
+// Negative test: the yield uses an alloca sized by a different value (%arg1)
+// rather than the iter_arg's dimension, so the dimension is not preserved
+// across iterations. The alloca should NOT be hoisted.
+func.func @no_hoist_alloca_yield_dim_not_preserved(%arg0 : index, %arg1 : index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %static = memref.alloca() : memref<1x4xf32>
+  %sv = memref.subview %static[0, 0][1, %arg0][1, 1]
+      : memref<1x4xf32> to memref<1x?xf32, strided<[4, 1]>>
+  %init = memref.cast %sv
+      : memref<1x?xf32, strided<[4, 1]>> to memref<1x?xf32, strided<[?, ?], offset: ?>>
+  %result = scf.for %iv = %c0 to %arg0 step %c1
+      iter_args(%iter = %init) -> (memref<1x?xf32, strided<[?, ?], offset: ?>>) {
+    %dim = memref.dim %iter, %c1 : memref<1x?xf32, strided<[?, ?], offset: ?>>
+    %inner = memref.alloca(%dim) : memref<1x?xf32>
+    linalg.fill ins(%cst : f32) outs(%inner : memref<1x?xf32>)
+    // Yield an alloca with a different size — dimension not preserved.
+    %other = memref.alloca(%arg1) : memref<1x?xf32>
+    %cast = memref.cast %other
+        : memref<1x?xf32> to memref<1x?xf32, strided<[?, ?], offset: ?>>
+    scf.yield %cast : memref<1x?xf32, strided<[?, ?], offset: ?>>
+  }
+  return
+}
+// CHECK-LABEL: func @no_hoist_alloca_yield_dim_not_preserved(
+// CHECK: scf.for
+// CHECK: %[[DIM:.+]] = memref.dim
+// CHECK: memref.alloca(%[[DIM]]) : memref<1x?xf32>
```

compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir

Lines changed: 42 additions & 0 deletions
```diff
@@ -3220,3 +3220,45 @@ func.func @drop_fusion_barrier() -> memref<6xf32> {
 // CHECK-LABEL: func.func @drop_fusion_barrier
 // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<6xf32>
 // CHECK: return %[[ALLOC]]
+
+// -----
+
+// Regression test for https://github.com/iree-org/iree/issues/16956.
+// The yield operand %new is not buffer-equivalent to the iter bbArg %arg
+// because %arg is read after %new is computed. With allowReturnAllocsFromLoops,
+// bufferization allocates a new buffer inside the loop instead of failing.
+func.func @bufferize_non_equivalent_scf_yield() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty() : tensor<4xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<4xf32>) -> tensor<4xf32>
+  %0 = scf.for %iv = %c0 to %c4 step %c1
+      iter_args(%arg = %fill) -> (tensor<4xf32>) {
+    %new = linalg.generic {
+        indexing_maps = [affine_map<(d0) -> (d0)>],
+        iterator_types = ["parallel"]}
+        outs(%arg : tensor<4xf32>) {
+    ^bb0(%out: f32):
+      %v = arith.addf %out, %cst : f32
+      linalg.yield %v : f32
+    } -> tensor<4xf32>
+    // Reading %arg after %new forces non-equivalence.
+    %use = linalg.generic {
+        indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+        iterator_types = ["parallel"]}
+        ins(%new : tensor<4xf32>) outs(%arg : tensor<4xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %s = arith.subf %out, %in : f32
+      linalg.yield %s : f32
+    } -> tensor<4xf32>
+    scf.yield %new : tensor<4xf32>
+  }
+  return
+}
+// CHECK-LABEL: func.func @bufferize_non_equivalent_scf_yield
+// CHECK: %[[INIT:.+]] = memref.alloc() : memref<4xf32>
+// CHECK: scf.for {{.*}} iter_args(%[[ARG:.+]] = %[[INIT]])
+// CHECK: %[[NEW:.+]] = memref.alloc() : memref<4xf32>
+// CHECK: scf.yield %[[NEW]]
```

compiler/src/iree/compiler/Codegen/Transforms/Transforms.cpp

Lines changed: 142 additions & 15 deletions
```diff
@@ -30,6 +30,7 @@
 #include "mlir/Dialect/Affine/Transforms/Transforms.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h"
@@ -118,6 +119,81 @@ cloneOffsetsSizesAndStrides(OpBuilder &builder,
       loadOp.getMixedSizes(), loadOp.getMixedStrides(), loadOp.getSourceDims());
 }
 
+/// Returns true if the yield operand for the `argIdx`-th iter_arg of `forOp`
+/// preserves the dimension at `dimIndex`. This is needed to verify that
+/// computing an allocation bound from the init value is sound — if the yield
+/// could produce a larger dimension, the init-derived bound would be too small.
+/// Verify the dimension is preserved, via:
+///
+/// (1) The yield operand (after walking through cast/subview) is the iter_arg.
+/// (2) The yield operand traces to an alloca whose shape matches the iter_arg
+///     and whose dynamic size at `dimIndex` is `memref.dim` of the iter_arg.
+/// (3) The yield operand is a scf.for result whose init arg is the iter_arg
+///     and the inner loop also preserves the dimension (recursive).
+///
+/// Note: This may revisit inner loops when called at each nesting level during
+/// the source walk in computeAllocationBound. Caching would help if the nesting
+/// depth were large, but in practice it is bounded by the tensor rank.
+static bool isYieldDimPreserved(scf::ForOp forOp, unsigned argIdx,
+                                int64_t dimIndex) {
+  BlockArgument iterArg = forOp.getRegionIterArg(argIdx);
+  auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
+  Value yieldVal = yieldOp.getOperand(argIdx);
+
+  // Walk through cast/subview to find the underlying source.
+  while (true) {
+    if (auto castOp = yieldVal.getDefiningOp<memref::CastOp>()) {
+      yieldVal = castOp.getSource();
+      continue;
+    }
+    if (auto subviewOp = yieldVal.getDefiningOp<memref::SubViewOp>()) {
+      yieldVal = subviewOp.getSource();
+      continue;
+    }
+    break;
+  }
+
+  // Case 1: Yield is the iter_arg itself — dimension trivially invariant.
+  if (yieldVal == iterArg) {
+    return true;
+  }
+
+  // Case 2: Yield traces to an alloca whose dynamic size at dimIndex comes
+  // from memref.dim of the same iter_arg (self-referential invariance).
+  if (auto allocaOp = yieldVal.getDefiningOp<memref::AllocaOp>()) {
+    MemRefType allocType = allocaOp.getType();
+    auto iterArgType = cast<MemRefType>(iterArg.getType());
+    // Shape comparison ensures same rank and same static/dynamic pattern,
+    // so we can directly index the dynamic sizes by counting dynamic dims
+    // before dimIndex.
+    if (allocType.getShape() != iterArgType.getShape()) {
+      return false;
+    }
+    unsigned dynIdx = llvm::count_if(allocType.getShape().take_front(dimIndex),
+                                     ShapedType::isDynamic);
+    auto dimOp =
+        allocaOp.getDynamicSizes()[dynIdx].getDefiningOp<memref::DimOp>();
+    if (!dimOp || dimOp.getSource() != iterArg) {
+      return false;
+    }
+    auto idx = dimOp.getConstantIndex();
+    return idx && *idx == dimIndex;
+  }
+
+  // Case 3: Yield is a scf.for result whose init arg at the same index is
+  // the iter_arg, and the inner loop also preserves the dimension.
+  if (auto result = dyn_cast<OpResult>(yieldVal)) {
+    if (auto innerFor = dyn_cast<scf::ForOp>(result.getOwner())) {
+      unsigned resultIdx = result.getResultNumber();
+      if (innerFor.getInitArgs()[resultIdx] == iterArg) {
+        return isYieldDimPreserved(innerFor, resultIdx, dimIndex);
+      }
+    }
+  }
+
+  return false;
+}
+
 template <typename AllocLikeOpType>
 std::optional<Value> hoistOneStaticallyBoundAllocation(
     mlir::FunctionOpInterface funcOp, OpBuilder &builder, Location loc,
@@ -156,30 +232,81 @@ std::optional<Value> hoistOneStaticallyBoundAllocation(
           vector::ScalableValueBoundsConstraintSet::computeScalableBound(
               value, std::nullopt, vscaleRange->vscaleMin,
              vscaleRange->vscaleMax, presburger::BoundType::UB);
-      if (failed(ub)) {
-        return failure();
-      }
+      if (succeeded(ub)) {
+        if (ub->map.isSingleConstant()) {
+          auto constantBound = ub->map.getSingleConstantResult();
+          return OpFoldResult(builder.getIndexAttr(constantBound));
+        }
 
-      if (ub->map.isSingleConstant()) {
-        auto constantBound = ub->map.getSingleConstantResult();
-        return OpFoldResult(builder.getIndexAttr(constantBound));
+        if (!vscale) {
+          vscale = vector::VectorScaleOp::create(builder, loc);
+        }
+        return affine::materializeComputedBound(
+            builder, loc, ub->map, {std::make_pair(vscale, std::nullopt)});
       }
+    } else {
+      // Non-scalable target: Assume everything is fixed-size.
+      auto ub = ValueBoundsConstraintSet::computeConstantBound(
+          presburger::BoundType::UB, {value, std::nullopt},
+          /*stopCondition=*/nullptr,
+          /*closedUB=*/true);
+      if (succeeded(ub)) {
+        return OpFoldResult(builder.getIndexAttr(*ub));
+      }
+    }
 
-      if (!vscale) {
-        vscale = vector::VectorScaleOp::create(builder, loc);
+    // Special case for memref.dim. If the value is a memref.dim on a loop
+    // iter_arg, try computing the bound using the init value's dimension. This
+    // handles cases where bufferization creates loop-internal allocas with
+    // sizes derived from iter_arg dimensions (e.g., from issue #16956,
+    // allowReturnAllocsFromLoops, etc). The value bounds analysis cannot trace
+    // through scf.for iter_args, so we walk up to the outermost init value and
+    // compute the bound from there.
+    auto dimOp = value.getDefiningOp<memref::DimOp>();
+    if (!dimOp) {
+      return failure();
+    }
+    std::optional<int64_t> constIndex = dimOp.getConstantIndex();
+    if (!constIndex) {
+      return failure();
+    }
+
+    // Walk up through nested loop iter_args, casts, and subviews to find a
+    // value whose dimension bound can be computed.
+    Value source = dimOp.getSource();
+    while (true) {
+      if (auto blockArg = dyn_cast<BlockArgument>(source)) {
+        auto forOp = dyn_cast<scf::ForOp>(blockArg.getOwner()->getParentOp());
+        if (!forOp) {
+          break;
+        }
+        unsigned argIdx = blockArg.getArgNumber() - forOp.getNumInductionVars();
+        if (isYieldDimPreserved(forOp, argIdx, *constIndex)) {
+          source = forOp.getInitArgs()[argIdx];
+          continue;
+        }
       }
-      return affine::materializeComputedBound(
-          builder, loc, ub->map, {std::make_pair(vscale, std::nullopt)});
+      if (auto castOp = source.getDefiningOp<memref::CastOp>()) {
+        source = castOp.getSource();
+        continue;
+      }
+      if (auto subviewOp = source.getDefiningOp<memref::SubViewOp>()) {
+        source = subviewOp.getSource();
+        continue;
+      }
+      break;
+    }
+    if (source == dimOp.getSource()) {
+      return failure();
     }
-    // Non-scalable target: Assume everything is fixed-size.
+
     auto ub = ValueBoundsConstraintSet::computeConstantBound(
-        presburger::BoundType::UB, {value, std::nullopt},
+        presburger::BoundType::UB, {source, *constIndex},
         /*stopCondition=*/nullptr,
         /*closedUB=*/true);
     if (failed(ub)) {
      return failure();
    }
-
    return OpFoldResult(builder.getIndexAttr(*ub));
  };
 
@@ -264,8 +391,8 @@ std::optional<Value> hoistOneStaticallyBoundAllocation(
 /// non-trivial because of compatibility between types of different SSA values.
 static bool isUseReplaceableWithSubview(OpOperand &use) {
   Operation *user = use.getOwner();
-  return isa<linalg::LinalgOp, memref::DeallocOp, memref::StoreOp,
-             memref::SubViewOp>(user);
+  return isa<linalg::LinalgOp, memref::CastOp, memref::DeallocOp,
+             memref::LoadOp, memref::StoreOp, memref::SubViewOp>(user);
 }
 
 template <typename AllocLikeOpType>
```
