Skip to content

Commit 8f8b91b

Browse files
authored
[RemoveLayoutConversions]: Add support for tt.store operation using block ptr updated by tt.advance operation (#4277)
This PR extends the layout-conversion pass so that "tt.store" operations are supported when their block pointer is produced by a chain of "tt.advance" operations rather than directly by "tt.make_tensor_ptr". --------- Signed-off-by: Tiotto, Ettore <[email protected]>
1 parent d9b6052 commit 8f8b91b

File tree

2 files changed

+99
-18
lines changed

2 files changed

+99
-18
lines changed

test/TritonIntelGPU/combine.mlir

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2472,3 +2472,49 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
24722472
tt.return
24732473
}
24742474
}
2475+
2476+
// -----

// COM: Test that the DPAS layout is propagated to the store operation in the presence of an advance operation updating its base pointer.
// CHECK-NOT: #ttg.blocked<{.*}>
// CHECK: #[[$DPAS:.+]] = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
#dot0 = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth=1}>
#dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
  // CHECK-LABEL: matmul_kernel_with_block_pointers
  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
    %c1_i64 = arith.constant 1 : i64
    %c0_i32 = arith.constant 0 : i32
    %c0_i64 = arith.constant 0 : i64
    %c32_i32 = arith.constant 32 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<64x256xf32, #blocked1>
    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<64x32xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
    %18 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x32xf16, #blocked>>
    %22 = tt.make_tensor_ptr %arg1, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #blocked1>>
    %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<64x256xf32, #blocked1>, !tt.ptr<tensor<64x32xf16, #blocked>>, !tt.ptr<tensor<32x256xf16, #blocked1>>) : i32 {
      // CHECK-NOT: ttg.convert_layout
      %28 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major" } : !tt.ptr<tensor<64x32xf16, #blocked>>
      %29 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"} : !tt.ptr<tensor<32x256xf16, #blocked1>>
      %36 = ttg.convert_layout %arg10 : tensor<64x256xf32, #blocked1> -> tensor<64x256xf32, #dpas>
      %30 = ttg.convert_layout %28 : tensor<64x32xf16, #blocked> -> tensor<64x32xf16, #dot0>
      %31 = ttg.convert_layout %29 : tensor<32x256xf16, #blocked1> -> tensor<32x256xf16, #dot1>
      %32 = tt.dot %30, %31, %36, inputPrecision = tf32 : tensor<64x32xf16, #dot0> * tensor<32x256xf16, #dot1> -> tensor<64x256xf32, #dpas>
      %33 = tt.advance %arg11, [%c0_i32, %c32_i32] : <tensor<64x32xf16, #blocked>>
      %34 = tt.advance %arg12, [%c32_i32, %c0_i32] : <tensor<32x256xf16, #blocked1>>
      %35 = ttg.convert_layout %32 : tensor<64x256xf32, #dpas> -> tensor<64x256xf32, #blocked1>
      scf.yield %35, %33, %34 : tensor<64x256xf32, #blocked1>, !tt.ptr<tensor<64x32xf16, #blocked>>, !tt.ptr<tensor<32x256xf16, #blocked1>>
    }
    %24 = arith.truncf %23#0 : tensor<64x256xf32, #blocked1> to tensor<64x256xf16, #blocked1>
    // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #[[$DPAS]]>>
    // CHECK: [[PTR2:%.*]] = tt.advance [[PTR1]], {{.*}} : <tensor<64x256xf16, #[[$DPAS]]>>
    // CHECK: tt.store [[PTR2]], {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #[[$DPAS]]>>
    %27 = tt.make_tensor_ptr %arg2, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #blocked1>>
    %newptr = tt.advance %27, [%c32_i32, %c32_i32] : <tensor<64x256xf16, #blocked1>>
    tt.store %newptr, %24 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #blocked1>>
    tt.return
  }
}

third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp

Lines changed: 53 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,29 @@ void LayoutPropagation::rewriteAssertOp(AssertOp assertOp) {
692692
assertOp->setOperand(0, newOperand);
693693
}
694694

695+
// Recursively update the operands in a chain of AdvanceOps, after setting the
696+
// pointer operand of the first one.
697+
static void updateAdvanceOpChain(AdvanceOp advanceOp, Value makeTensorPtrOp,
698+
Value dataToStore) {
699+
OpBuilder rewriter(advanceOp);
700+
auto newAdvanceOp =
701+
rewriter.create<AdvanceOp>(advanceOp.getLoc(), makeTensorPtrOp.getType(),
702+
makeTensorPtrOp, advanceOp.getOffsets());
703+
704+
SmallVector<Operation *> advanceOpUsers(advanceOp->getUsers());
705+
for (Operation *user : advanceOpUsers) {
706+
if (auto storeOp = dyn_cast<StoreOp>(user)) {
707+
storeOp.setOperand(0, newAdvanceOp);
708+
storeOp.setOperand(1, dataToStore);
709+
} else if (auto advanceOp = dyn_cast<AdvanceOp>(user)) {
710+
updateAdvanceOpChain(advanceOp, makeTensorPtrOp, dataToStore);
711+
} else {
712+
llvm::errs() << "user: " << *user << "\n";
713+
llvm_unreachable("Unexpected user");
714+
}
715+
}
716+
}
717+
695718
bool LayoutPropagation::rewriteStoreOp(StoreOp storeOp) {
696719
// Disable 2D block store on LTS.
697720
if (!storeOp->getParentOfType<ModuleOp>()->hasAttr(
@@ -705,13 +728,16 @@ bool LayoutPropagation::rewriteStoreOp(StoreOp storeOp) {
705728
if (!isTensorPointerType(ptr.getType()))
706729
return false;
707730

708-
// 2D block store are preceeded by a MakeTensorPtrOp
709-
auto makeTensorPtrOp = ptr.getDefiningOp<MakeTensorPtrOp>();
710-
if (!makeTensorPtrOp)
711-
return false;
731+
// Locate the operation that created the block pointer.
732+
Operation *defOp = ptr.getDefiningOp();
733+
while (auto advanceOp = dyn_cast<AdvanceOp>(defOp))
734+
defOp = advanceOp.getPtr().getDefiningOp();
735+
assert(isa<MakeTensorPtrOp>(defOp) &&
736+
"MakeTensorPtrOp should be the only op that creates a tensor pointer");
737+
auto makeTensorPtrOp = cast<MakeTensorPtrOp>(defOp);
712738

713-
// DPAS encoding have to be propagate if conversion from DPAS to
714-
// other has been done before.
739+
// DPAS encoding have to be propagated if conversion from a DPAS layout to
740+
// another layout has been done before.
715741
auto convertOp = storeOp.getValue().getDefiningOp<ConvertLayoutOp>();
716742
PointerType newPtrType;
717743
Attribute encoding;
@@ -758,21 +784,26 @@ bool LayoutPropagation::rewriteStoreOp(StoreOp storeOp) {
758784
encoding = convertOpSrcType.getEncoding();
759785
}
760786

761-
// We create a new MakeTensorPtrOp with the new data type.
787+
// Create a new MakeTensorPtrOp with the new layout.
762788
OpBuilder rewriter(makeTensorPtrOp);
763-
Value newStorePtr = rewriter.create<MakeTensorPtrOp>(
789+
Value newMakeTensorPtrOp = rewriter.create<MakeTensorPtrOp>(
764790
makeTensorPtrOp.getLoc(), newPtrType, makeTensorPtrOp.getBase(),
765791
makeTensorPtrOp.getShape(), makeTensorPtrOp.getStrides(),
766-
makeTensorPtrOp.getOffsets(), rewriter.getDenseI32ArrayAttr({1, 0}));
767-
768-
// The encoding of the StoreOp is updated with the new
769-
// operands:
770-
// - the Ptr created by the MakeTensorPtrOp with the new data
771-
// type
772-
// - the forwarded DPAS encoding.
773-
Value newOperand = getValueAs(value, encoding);
774-
storeOp.setOperand(0, newStorePtr);
775-
storeOp.setOperand(1, newOperand);
792+
makeTensorPtrOp.getOffsets(), makeTensorPtrOp.getOrderAttr());
793+
794+
// Update the store operation with the new layout.
795+
SmallVector<Operation *> makeTensorPtrOpUsers(makeTensorPtrOp->getUsers());
796+
Value dataToStore = getValueAs(value, encoding);
797+
Block *storeBB = storeOp->getBlock();
798+
for (Operation *user : makeTensorPtrOpUsers) {
799+
Block *userBB = user->getBlock();
800+
if (auto storeOp = dyn_cast<StoreOp>(user)) {
801+
storeOp.setOperand(0, newMakeTensorPtrOp);
802+
storeOp.setOperand(1, dataToStore);
803+
} else if (auto advanceOp = dyn_cast<AdvanceOp>(user)) {
804+
updateAdvanceOpChain(advanceOp, newMakeTensorPtrOp, dataToStore);
805+
}
806+
}
776807

777808
// If the DPAS encoding is forwarded, we do not need the
778809
// convertOp anymore if the convertOp was only used by the
@@ -1607,6 +1638,7 @@ class TritonIntelGPURemoveLayoutConversionsPass
16071638
LLVM_DEBUG({
16081639
DBGS() << "Module after propagating layouts forward:\n";
16091640
m.dump();
1641+
assert(succeeded(verify(m)) && "Module verification failed");
16101642
});
16111643

16121644
cleanupConvertOps();
@@ -1617,6 +1649,7 @@ class TritonIntelGPURemoveLayoutConversionsPass
16171649
LLVM_DEBUG({
16181650
DBGS() << "Module after backward remat:\n";
16191651
m.dump();
1652+
assert(succeeded(verify(m)) && "Module verification failed");
16201653
});
16211654

16221655
// Cleanup dummy converts created during backward remat.
@@ -1628,6 +1661,7 @@ class TritonIntelGPURemoveLayoutConversionsPass
16281661
LLVM_DEBUG({
16291662
DBGS() << "Module after hoisting converts:\n";
16301663
m.dump();
1664+
assert(succeeded(verify(m)) && "Module verification failed");
16311665
});
16321666

16331667
// 4. Apply clean up patterns to remove remove dead convert and dead code
@@ -1643,6 +1677,7 @@ class TritonIntelGPURemoveLayoutConversionsPass
16431677
LLVM_DEBUG({
16441678
DBGS() << "Module after final cleanups:\n";
16451679
m.dump();
1680+
assert(succeeded(verify(m)) && "Module verification failed");
16461681
});
16471682
}
16481683
};

0 commit comments

Comments
 (0)