Commit cfb23d7

[RemoveLayoutConversions]: Reduce loop carried values - part 2 (#4921)
Implements functionality to reduce loop-carried values in the `RemoveLayoutConversions` pass by eliminating unnecessary loop-carried tensor pointer values when they can be reconstructed from other loop-yielded values plus layout conversions.

Signed-off-by: Tiotto, Ettore <[email protected]>
1 parent 287802f commit cfb23d7
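
In essence, when a loop yields two tensor pointers that address the same data but carry different layout encodings, post-loop uses of the redundant result are rewired to the equivalent result, with a `ttg.convert_layout` inserted where the original layout is still needed; the redundant carried value then becomes dead and can be cleaned up. A minimal hand-written sketch of the idea (the names %pA, %pB and the #blockedA/#dotB encodings are illustrative only, and the loop body is elided; this is not taken from the test suite):

    // Before: the loop carries both a #blockedA-typed and a #dotB-typed pointer
    // to the same tensor, and the #blockedA result is used after the loop.
    %res:2 = scf.for %i = %lb to %ub step %st
        iter_args(%pA = %a, %pB = %b)
        -> (!tt.ptr<tensor<32x256xf16, #blockedA>>, !tt.ptr<tensor<32x256xf16, #dotB>>) { ... }
    %v = tt.load %res#0 : !tt.ptr<tensor<32x256xf16, #blockedA>>

    // After: uses of %res#0 are rewired through %res#1 plus a layout conversion,
    // so %res#0 becomes dead and can be removed by later cleanup.
    %r = tt.load %res#1 : !tt.ptr<tensor<32x256xf16, #dotB>>
    %v = ttg.convert_layout %r : tensor<32x256xf16, #dotB> -> tensor<32x256xf16, #blockedA>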

File tree

2 files changed: +98 -70 lines


test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir

Lines changed: 9 additions & 0 deletions
@@ -349,6 +349,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32}
 // -----
 
 // CHECK: #[[BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
+// CHECK: #[[BLOCKED1:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
 // CHECK: #[[DPAS:.+]] = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
@@ -396,6 +397,14 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
     %30 = tt.load %29 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major" } : !tt.ptr<tensor<64x32xf16, #blocked>>
     tt.store %23#1, %30 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x32xf16, #blocked>>
 
+    // CHECK: [[ADV:%.*]] = tt.advance [[LOOP_RES]]#2, {{.*}} : <tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+    // CHECK: [[LOAD3:%.*]] = tt.load [[ADV]] {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"} : !tt.ptr<tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+    // CHECK: [[CONV3:%.*]] = ttg.convert_layout [[LOAD3]] : tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>> -> tensor<32x256xf16, #[[BLOCKED1]]>
+    // CHECK: tt.store {{.*}}, [[CONV3]] {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #[[BLOCKED1]]>
+    %31 = tt.advance %23#2, [%c0_i32, %c32_i32] : <tensor<32x256xf16, #blocked1>>
+    %32 = tt.load %31 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major" } : !tt.ptr<tensor<32x256xf16, #blocked1>>
+    %33 = tt.make_tensor_ptr %arg2, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #blocked1>>
+    tt.store %33, %32 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #blocked1>>
     tt.return
   }
 }

third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp

Lines changed: 89 additions & 70 deletions
@@ -167,6 +167,7 @@ class LayoutRematerialization {
 
 private:
   void updateRematMapping(SmallVector<std::tuple<Value, Value>> &values);
+  void reduceLoopCarriedValues();
   // Existing tuples of (value, layout) that needs to be updated when recreating
   // scf ops. This prevents keeping track of Values that have been delete when
   // rewriting slices.
@@ -1009,6 +1010,93 @@ void LayoutRematerialization::updateRematMapping(
   }
 }
 
+/// Reduce loop carried values if the value is used after the loop and can be
+/// removed by using another loop yielded value plus a convert layout operation.
+void LayoutRematerialization::reduceLoopCarriedValues() {
+  for (auto [pair, val] : rematMapping) {
+    auto arg = dyn_cast<BlockArgument>(pair.first);
+    if (!arg)
+      continue;
+
+    if (!isTensorPointerType(arg.getType()))
+      continue;
+
+    auto loopOp = dyn_cast<LoopLikeOpInterface>(arg.getOwner()->getParentOp());
+    if (!loopOp)
+      continue;
+
+    // Loop arguments that corresponds to a loop result which is not used are
+    // not interesting.
+    OpResult loopRes = loopOp.getTiedLoopResult(arg);
+    if (loopRes.getNumUses() == 0)
+      continue;
+
+    std::function<void(Operation *, Value)> processUser = [&](Operation *user,
+                                                              Value rematRes) {
+      Location loc = user->getLoc();
+      OpBuilder rewriter(user);
+
+      TypeSwitch<Operation *>(user)
+          .Case<LoadOp>([&](auto loadOp) {
+            auto newLoadOp =
+                rewriter.create<LoadOp>(loc, rematRes, loadOp->getAttrs());
+            auto convOp = rewriter.create<ConvertLayoutOp>(
+                loc, loadOp.getType(), newLoadOp.getResult());
+            loadOp->replaceAllUsesWith(convOp);
+            opToDelete.insert(loadOp);
+            LLVM_DEBUG({
+              DBGS() << "Replaced:\n\t" << *loadOp << "\n"
+                     << "with:\n\t" << *newLoadOp << "\n"
+                     << "\t" << *convOp << "\n";
+            });
+          })
+          .Case<StoreOp>([&](auto storeOp) {
+            Value data = storeOp.getOperand(1);
+            auto dataType = cast<RankedTensorType>(data.getType());
+            auto newPtrType = cast<PointerType>(rematRes.getType());
+            Attribute encoding =
+                cast<RankedTensorType>(newPtrType.getPointeeType())
+                    .getEncoding();
+            RankedTensorType newDataType = dataType.cloneWithEncoding(encoding);
+            auto convOp =
+                rewriter.create<ConvertLayoutOp>(loc, newDataType, data);
+            auto newStoreOp = rewriter.create<StoreOp>(
+                loc, rematRes, convOp, storeOp.getBoundaryCheck(),
+                storeOp.getCache(), storeOp.getEvict());
+            opToDelete.insert(storeOp);
+            LLVM_DEBUG({
+              DBGS() << "Replaced:\n\t" << *storeOp << "\n"
+                     << "with:\n\t" << *convOp << "\n"
+                     << "\t" << *newStoreOp << "\n";
+            });
+          })
+          .Case<AdvanceOp>([&](auto advanceOp) {
+            auto newAdvanceOp = rewriter.create<AdvanceOp>(
+                loc, rematRes.getType(), rematRes, advanceOp.getOffsets());
+            opToDelete.insert(advanceOp);
+            LLVM_DEBUG({
+              DBGS() << "Replaced:\n\t" << *advanceOp << "\n"
+                     << "with:\n\t" << *newAdvanceOp << "\n";
+            });
+
+            for (Operation *user : advanceOp->getUsers())
+              processUser(user, newAdvanceOp.getResult());
+          })
+          .Default([](auto op) {
+            llvm::report_fatal_error(llvm::Twine(
+                "Unsupported operation in backward rematerialization: '" +
+                op->getName().getStringRef() + "'"));
+          });
+    };
+
+    // Replace the loop result corresponding to the argument with an
+    // equivalent loop result.
+    OpResult rematRes = loopOp.getTiedLoopResult(cast<BlockArgument>(val));
+    for (Operation *user : loopRes.getUsers())
+      processUser(user, rematRes);
+  }
+}
+
 void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice,
                                            DenseMap<Value, Attribute> &layout,
                                            ConvertLayoutOp convertOp,
@@ -1269,76 +1357,7 @@ void LayoutRematerialization::backwardRematerialization() {
    }
  }
 
-  // Reduce loop carried values if the value can be removed by using another
-  // loop yielded value plus a convert layout operation.
-  for (auto [pair, val] : rematMapping) {
-    auto arg = dyn_cast<BlockArgument>(pair.first);
-    if (!arg)
-      continue;
-
-    if (!isTensorPointerType(arg.getType()))
-      continue;
-
-    if (auto loopOp =
-            dyn_cast<LoopLikeOpInterface>(arg.getOwner()->getParentOp())) {
-      // Loop arguments that corresponds to a loop result which is not used are
-      // not interesting.
-      OpResult loopRes = loopOp.getTiedLoopResult(arg);
-      if (loopRes.getNumUses() == 0)
-        continue;
-
-      // Replace the loop result corresponding to the argument with an
-      // equivalent loop result.
-      auto rematArg = cast<BlockArgument>(val);
-      OpResult rematRes = loopOp.getTiedLoopResult(rematArg);
-
-      for (Operation *user : loopRes.getUsers()) {
-        Location loc = user->getLoc();
-        OpBuilder rewriter(user);
-
-        TypeSwitch<Operation *>(user)
-            .Case<LoadOp>([&](auto loadOp) {
-              auto newLoadOp =
-                  rewriter.create<LoadOp>(loc, rematRes, loadOp->getAttrs());
-              auto convOp = rewriter.create<ConvertLayoutOp>(
-                  loc, loadOp.getType(), newLoadOp.getResult());
-              loadOp->replaceAllUsesWith(convOp);
-              opToDelete.insert(loadOp);
-              LLVM_DEBUG({
-                DBGS() << "Replaced:\n\t" << *loadOp << "\n";
-                DBGS() << "with:\n\t" << *newLoadOp << "\n"
-                       << "\t" << *convOp << "\n";
-              });
-            })
-            .Case<StoreOp>([&](auto storeOp) {
-              Value data = storeOp.getOperand(1);
-              auto dataType = cast<RankedTensorType>(data.getType());
-              auto newPtrType = cast<PointerType>(rematRes.getType());
-              Attribute encoding =
-                  cast<RankedTensorType>(newPtrType.getPointeeType())
-                      .getEncoding();
-              RankedTensorType newDataType =
-                  dataType.cloneWithEncoding(encoding);
-              auto convOp =
-                  rewriter.create<ConvertLayoutOp>(loc, newDataType, data);
-              auto newStoreOp = rewriter.create<StoreOp>(
-                  loc, rematRes, convOp, storeOp.getBoundaryCheck(),
-                  storeOp.getCache(), storeOp.getEvict());
-              opToDelete.insert(storeOp);
-              LLVM_DEBUG({
-                DBGS() << "Replaced:\n\t" << *storeOp << "\n";
-                DBGS() << "with:\n\t" << *convOp << "\n"
-                       << "\t" << *newStoreOp << "\n";
-              });
-            })
-            .Default([](auto op) {
-              llvm::report_fatal_error(llvm::Twine(
-                  "Unsupported operation in backward rematerialization: '" +
-                  op->getName().getStringRef() + "'"));
-            });
-      }
-    }
-  }
+  reduceLoopCarriedValues();
 }
 
 void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast() {
