Skip to content

Commit 1d17bcd

Browse files
amd-eochoalo and MaheshRavishankar
authored and committed
Revert "[MLIR] Improve in-place folding to iterate until fixed-point (llvm#160615)"
This reverts commit fcf79e5.
1 parent 893b1d4 commit 1d17bcd

File tree

9 files changed

+42
-77
lines changed

9 files changed

+42
-77
lines changed

mlir/include/mlir/Transforms/FoldUtils.h

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,7 @@ class OperationFolder {
4040
/// deduplicated constants. If successful, replaces `op`'s uses with
4141
/// folded results, and returns success. If the op was completely folded it is
4242
/// erased. If it is just updated in place, `inPlaceUpdate` is set to true.
43-
/// On success() and when in-place, the folder is invoked until
44-
/// `maxIterations` is reached (default INT_MAX).
45-
LogicalResult tryToFold(Operation *op, bool *inPlaceUpdate = nullptr,
46-
int maxIterations = INT_MAX);
43+
LogicalResult tryToFold(Operation *op, bool *inPlaceUpdate = nullptr);
4744

4845
/// Tries to fold a pre-existing constant operation. `constValue` represents
4946
/// the value of the constant, and can be optionally passed if the value is
@@ -85,10 +82,7 @@ class OperationFolder {
8582

8683
/// Tries to perform folding on the given `op`. If successful, populates
8784
/// `results` with the results of the folding.
88-
/// On success() and when in-place, the folder is invoked until
89-
/// `maxIterations` is reached (default INT_MAX).
90-
LogicalResult tryToFold(Operation *op, SmallVectorImpl<Value> &results,
91-
int maxIterations = INT_MAX);
85+
LogicalResult tryToFold(Operation *op, SmallVectorImpl<Value> &results);
9286

9387
/// Try to process a set of fold results. Populates `results` on success,
9488
/// otherwise leaves it unchanged.

mlir/lib/IR/Builders.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
#include "mlir/IR/IRMapping.h"
1515
#include "mlir/IR/Matchers.h"
1616
#include "llvm/ADT/SmallVectorExtras.h"
17-
#include "llvm/Support/DebugLog.h"
1817

1918
using namespace mlir;
2019

@@ -487,18 +486,9 @@ OpBuilder::tryFold(Operation *op, SmallVectorImpl<Value> &results,
487486

488487
// Try to fold the operation.
489488
SmallVector<OpFoldResult, 4> foldResults;
490-
LDBG() << "Trying to fold: "
491-
<< OpWithFlags(op, OpPrintingFlags().skipRegions());
492489
if (failed(op->fold(foldResults)))
493490
return cleanupFailure();
494491

495-
int count = 0;
496-
do {
497-
LDBG() << "Folded in place #" << count
498-
<< " times: " << OpWithFlags(op, OpPrintingFlags().skipRegions());
499-
count++;
500-
} while (foldResults.empty() && succeeded(op->fold(foldResults)));
501-
502492
// An in-place fold does not require generation of any constants.
503493
if (foldResults.empty())
504494
return success();

mlir/lib/Transforms/Utils/FoldUtils.cpp

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include "mlir/IR/Builders.h"
1717
#include "mlir/IR/Matchers.h"
1818
#include "mlir/IR/Operation.h"
19-
#include "llvm/Support/DebugLog.h"
2019

2120
using namespace mlir;
2221

@@ -68,8 +67,7 @@ static Operation *materializeConstant(Dialect *dialect, OpBuilder &builder,
6867
// OperationFolder
6968
//===----------------------------------------------------------------------===//
7069

71-
LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate,
72-
int maxIterations) {
70+
LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate) {
7371
if (inPlaceUpdate)
7472
*inPlaceUpdate = false;
7573

@@ -88,7 +86,7 @@ LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate,
8886

8987
// Try to fold the operation.
9088
SmallVector<Value, 8> results;
91-
if (failed(tryToFold(op, results, maxIterations)))
89+
if (failed(tryToFold(op, results)))
9290
return failure();
9391

9492
// Check to see if the operation was just updated in place.
@@ -226,19 +224,10 @@ bool OperationFolder::isFolderOwnedConstant(Operation *op) const {
226224
/// Tries to perform folding on the given `op`. If successful, populates
227225
/// `results` with the results of the folding.
228226
LogicalResult OperationFolder::tryToFold(Operation *op,
229-
SmallVectorImpl<Value> &results,
230-
int maxIterations) {
227+
SmallVectorImpl<Value> &results) {
231228
SmallVector<OpFoldResult, 8> foldResults;
232-
if (failed(op->fold(foldResults)))
233-
return failure();
234-
int count = 1;
235-
do {
236-
LDBG() << "Folded in place #" << count
237-
<< " times: " << OpWithFlags(op, OpPrintingFlags().skipRegions());
238-
} while (count++ < maxIterations && foldResults.empty() &&
239-
succeeded(op->fold(foldResults)));
240-
241-
if (failed(processFoldResults(op, results, foldResults)))
229+
if (failed(op->fold(foldResults)) ||
230+
failed(processFoldResults(op, results, foldResults)))
242231
return failure();
243232
return success();
244233
}

mlir/test/Dialect/Arith/constant-fold.mlir

Lines changed: 0 additions & 18 deletions
This file was deleted.

mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@ gpu.module @test {
77
//CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
88
//CHECK: [[c32:%.+]] = arith.constant 32 : index
99
//CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
10+
//CHECK: [[c0:%.+]] = arith.constant 0 : index
11+
//CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
1012
//CHECK: [[c128:%.+]] = arith.constant 128 : index
11-
//CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
13+
//CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
1214
//CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
1315
//CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
1416
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
@@ -21,8 +23,10 @@ gpu.module @test {
2123
//CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
2224
//CHECK: [[c32:%.+]] = arith.constant 32 : index
2325
//CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
26+
//CHECK: [[c0:%.+]] = arith.constant 0 : index
27+
//CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
2428
//CHECK: [[c128:%.+]] = arith.constant 128 : index
25-
//CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
29+
//CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
2630
//CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
2731
//CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
2832
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>

mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,12 @@ gpu.module @test_round_robin_assignment {
2727
//CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]]
2828
//CHECK: [[C0:%.+]] = arith.constant 0 : index
2929
//CHECK: [[C0_1:%.+]] = arith.constant 0 : index
30+
//CHECK: [[ADDY:%.+]] = arith.addi [[LY]], [[C0]] : index
31+
//CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
3032
//CHECK: [[C128:%.+]] = arith.constant 128 : index
31-
//CHECK: [[offY:%.+]] = index.remu [[LY]], [[C128]]
33+
//CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]]
3234
//CHECK: [[C64_2:%.+]] = arith.constant 64 : index
33-
//CHECK: [[offX:%.+]] = index.remu [[LX]], [[C64_2]]
35+
//CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C64_2]]
3436
//CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32>
3537
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
3638
-> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>

mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -330,10 +330,12 @@ gpu.module @test_distribution {
330330
//CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]]
331331
//CHECK: [[c0:%.+]] = arith.constant 0 : index
332332
//CHECK: [[c0_1:%.+]] = arith.constant 0 : index
333+
//CHECK: [[l_off_y_0:%.+]] = arith.addi [[l_off_y]], [[c0]] : index
334+
//CHECK: [[l_off_x_0:%.+]] = arith.addi [[l_off_x]], [[c0_1]] : index
333335
//CHECK: [[c64:%.+]] = arith.constant 64 : index
334-
//CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
336+
//CHECK: [[off_y:%.+]] = index.remu [[l_off_y_0]], [[c64]]
335337
//CHECK: [[c128:%.+]] = arith.constant 128 : index
336-
//CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]]
338+
//CHECK: [[off_x:%.+]] = index.remu [[l_off_x_0]], [[c128]]
337339
//CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
338340
%0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
339341
%1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32], lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32>
@@ -352,11 +354,13 @@ gpu.module @test_distribution {
352354
//CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]]
353355
//CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]]
354356
//CHECK: [[c32:%.+]] = arith.constant 32 : index
355-
//CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]]
357+
//CHECK: [[l_off_y_0:%.+]] = index.mul [[id_y]], [[c32]]
356358
//CHECK: [[c32_1:%.+]] = arith.constant 32 : index
357-
//CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]]
359+
//CHECK: [[l_off_x_0:%.+]] = index.mul [[id_x]], [[c32_1]]
358360
//CHECK: [[c0:%.+]] = arith.constant 0 : index
359361
//CHECK: [[c0_2:%.+]] = arith.constant 0 : index
362+
//CHECK: [[l_off_y:%.+]] = arith.addi [[l_off_y_0]], [[c0]] : index
363+
//CHECK: [[l_off_x:%.+]] = arith.addi [[l_off_x_0]], [[c0_2]] : index
360364
//CHECK: [[c64:%.+]] = arith.constant 64 : index
361365
//CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
362366
//CHECK: [[c128:%.+]] = arith.constant 128 : index
@@ -413,10 +417,11 @@ gpu.module @test_distribution {
413417
//CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
414418
//CHECK-DAG: [[IDY:%.+]] = affine.apply #map2()[[[sgId]]]
415419
//CHECK-DAG: [[c32:%.+]] = arith.constant 32 : index
416-
//CHECK-DAG: [[LY:%.+]] = index.mul [[IDY]], [[c32]]
420+
//CHECK-DAG: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
417421
//CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
422+
//CHECK-DAG: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
418423
//CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index
419-
//CHECK-DAG: [[MODY:%.+]] = index.remu [[LY]], [[c128]]
424+
//CHECK-DAG: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
420425
//CHECK-DAG: [[BASE:%.+]] = vector.step : vector<32xindex>
421426
//CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
422427
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
@@ -430,8 +435,9 @@ gpu.module @test_distribution {
430435
//CHECK-DAG: [[c8:%.+]] = arith.constant 8 : index
431436
//CHECK-DAG: [[LOCALY:%.+]] = index.mul [[sgId]], [[c8]]
432437
//CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
438+
//CHECK-DAG: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
433439
//CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index
434-
//CHECK-DAG: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
440+
//CHECK-DAG: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
435441
//CHECK-DAG: [[BASE:%.+]] = vector.step : vector<8xindex>
436442
//CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex>
437443
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex>

mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@ gpu.module @test_1_1_assignment {
1414
//CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
1515
//CHECK: [[C0:%.+]] = arith.constant 0 : index
1616
//CHECK: [[C0_1:%.+]] = arith.constant 0 : index
17+
//CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
18+
//CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
1719
//CHECK: [[C256:%.+]] = arith.constant 256 : index
18-
//CHECK: [[Y:%.+]] = index.remu [[LY]], [[C256]]
20+
//CHECK: [[Y:%.+]] = index.remu [[UY]], [[C256]]
1921
//CHECK: [[C128:%.+]] = arith.constant 128 : index
20-
//CHECK: [[X:%.+]] = index.remu [[LX]], [[C128]]
22+
//CHECK: [[X:%.+]] = index.remu [[UX]], [[C128]]
2123
//CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
2224
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
2325
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -35,13 +37,17 @@ gpu.module @test_1_1_assignment {
3537
//CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
3638
//CHECK: [[C0:%.+]] = arith.constant 0 : index
3739
//CHECK: [[C0_2:%.+]] = arith.constant 0 : index
40+
//CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
41+
//CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_2]] : index
3842
//CHECK: [[C256:%.+]] = arith.constant 256 : index
39-
//CHECK: [[MODY:%.+]] = index.remu [[LY]], [[C256]]
43+
//CHECK: [[MODY:%.+]] = index.remu [[UY]], [[C256]]
4044
//CHECK: [[C128:%.+]] = arith.constant 128 : index
41-
//CHECK: [[MODX:%.+]] = index.remu [[LX]], [[C128]]
45+
//CHECK: [[MODX:%.+]] = index.remu [[UX]], [[C128]]
4246
//CHECK: [[C0_3:%.+]] = arith.constant 0 : index
47+
//CHECK: [[Y:%.+]] = index.add [[MODY]], [[C0_3]]
4348
//CHECK: [[C0_4:%.+]] = arith.constant 0 : index
44-
//CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[MODY]], [[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
49+
//CHECK: [[X:%.+]] = index.add [[MODX]], [[C0_4]]
50+
//CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[Y]], [[X]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
4551
%tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32>
4652
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
4753
gpu.return

mlir/test/lib/Transforms/TestSingleFold.cpp

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,6 @@ struct TestSingleFold : public PassWrapper<TestSingleFold, OperationPass<>>,
2626
public RewriterBase::Listener {
2727
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestSingleFold)
2828

29-
TestSingleFold() = default;
30-
TestSingleFold(const TestSingleFold &pass) : PassWrapper(pass) {}
31-
3229
StringRef getArgument() const final { return "test-single-fold"; }
3330
StringRef getDescription() const final {
3431
return "Test single-pass operation folding and dead constant elimination";
@@ -48,18 +45,13 @@ struct TestSingleFold : public PassWrapper<TestSingleFold, OperationPass<>>,
4845
if (it != existingConstants.end())
4946
existingConstants.erase(it);
5047
}
51-
52-
Option<int> maxIterations{*this, "max-iterations",
53-
llvm::cl::desc("Max iterations in the tryToFold"),
54-
llvm::cl::init(1)};
5548
};
5649
} // namespace
5750

5851
void TestSingleFold::foldOperation(Operation *op, OperationFolder &helper) {
5952
// Attempt to fold the specified operation, including handling unused or
6053
// duplicated constants.
61-
bool inPlaceUpdate = false;
62-
(void)helper.tryToFold(op, &inPlaceUpdate, maxIterations);
54+
(void)helper.tryToFold(op);
6355
}
6456

6557
void TestSingleFold::runOnOperation() {

0 commit comments

Comments (0)