Commit 93b2d49

[BACKEND] Enhance block store lowering (#4544)
The first step is to enable the memory analysis on the pointer value used by the tt.store operation. A later change will enhance the lowering of tt.store based on that memory information, mirroring what is already done for the lowering of tt.load.

Signed-off-by: Lu,Chengjun <[email protected]>
1 parent ecb52e4 commit 93b2d49
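For context, the kind of kernel this change targets looks like the following Triton sketch (illustrative only, not part of this commit; the kernel name and parameters are hypothetical). Each tl.make_block_ptr lowers to tt.make_tensor_ptr, and the final tl.store lowers to a tt.store through a block pointer, which the pass can now annotate with ttig.block_io exactly as it already annotates tt.load:

import triton
import triton.language as tl

@triton.jit
def copy_tile_kernel(src_ptr, dst_ptr, M, N, stride_m, stride_n,
                     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    # Lowers to tt.make_tensor_ptr; order=(1, 0) marks the innermost
    # dimension as fast-changing (row-major).
    src = tl.make_block_ptr(base=src_ptr, shape=(M, N),
                            strides=(stride_m, stride_n),
                            offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
                            block_shape=(BLOCK_M, BLOCK_N), order=(1, 0))
    dst = tl.make_block_ptr(base=dst_ptr, shape=(M, N),
                            strides=(stride_m, stride_n),
                            offsets=(pid_m * BLOCK_M, pid_n * BLOCK_N),
                            block_shape=(BLOCK_M, BLOCK_N), order=(1, 0))
    tile = tl.load(src, boundary_check=(0, 1))
    # Lowers to tt.store on a block pointer; with this change the pass can
    # tag it with ttig.block_io = "row_major" when strides and alignment allow.
    tl.store(dst, tile, boundary_check=(0, 1))

When stride_n is 1 and the base, pitch, and offsets satisfy the alignment checks exercised by the test below, both the load and the store receive the ttig.block_io attribute, which later lowering can use to emit 2D block IO instead of element-wise accesses.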

File tree

2 files changed: +78 -34 lines changed

test/TritonIntelGPU/materialize-block-pointer.mlir

Lines changed: 33 additions & 0 deletions
@@ -18,13 +18,21 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
 %4 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%pitch, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x64xf16, #dot_b>>
 %5 = tt.load %3 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
 %6 = tt.load %4 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
+tt.store %3, %5 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<64x32xf16, #dot_a>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
+tt.store %4, %6 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<32x64xf16, #dot_b>>

 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0>, padding = 1 : i32, ttig.block_io = "column_major"}
 %7 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x32xf16, #dot_a>>
 %8 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<32x64xf16, #dot_b>>
 %9 = tt.load %7 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
 %10 = tt.load %8 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %7, %9 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<64x32xf16, #dot_a>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "column_major"}
+tt.store %8, %10 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<32x64xf16, #dot_b>>

 // COM: Non-constant stride on fast changing dim.
 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -33,6 +41,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
 %12 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%pitch, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<32x64xf16, #dot_b>>
 %13 = tt.load %11 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
 %14 = tt.load %12 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %11, %13 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<64x32xf16, #dot_a>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %12, %14 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<32x64xf16, #dot_b>>

 // COM: Non-64 divisible pitch.
 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -41,6 +53,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
 %16 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch_odd], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<32x64xf16, #dot_b>>
 %17 = tt.load %15 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
 %18 = tt.load %16 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %15, %17 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<64x32xf16, #dot_a>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %16, %18 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<32x64xf16, #dot_b>>

 // COM: Non 4 bytes aligned base.
 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -49,6 +65,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
 %20 = tt.make_tensor_ptr %arg1, [%c0_i64, %c0_i64], [%pitch, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x64xf16, #dot_b>>
 %21 = tt.load %19 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
 %22 = tt.load %20 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %19, %21 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<64x32xf16, #dot_a>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %20, %22 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<32x64xf16, #dot_b>>

 // COM: Non 4 bytes aligned baseWidth.
 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -57,6 +77,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
 %24 = tt.make_tensor_ptr %arg0, [%c0_i64, %c15_i64], [%pitch, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x64xf16, #dot_b>>
 %25 = tt.load %23 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
 %26 = tt.load %24 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %23, %25 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<64x32xf16, #dot_a>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %24, %26 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<32x64xf16, #dot_b>>

 // COM: Non 4 bytes aligned offsetX.
 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
@@ -65,6 +89,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", ttig.support_sg
 %28 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%pitch, %c1_i64], [%c0_i32, %c15_i32] {order = array<i32: 1, 0>} : <tensor<32x64xf16, #dot_b>>
 %29 = tt.load %27 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
 %30 = tt.load %28 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %27, %29 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<64x32xf16, #dot_a>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>}
+tt.store %28, %30 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<32x64xf16, #dot_b>>
+
 tt.return
 }
 }
@@ -103,6 +132,8 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
 // COM: 4 bytes aligned base (value got from addptr, addi, muli), baseWidth and offsetX (value got from muli).
 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0>, padding = 1 : i32, ttig.block_io = "row_major"}
 %11 = tt.load %10 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
+tt.store %10, %11 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>
 tt.return
 }
 }
@@ -130,6 +161,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 %14 = tt.advance %12, [%1, %arg3] : <tensor<8x128xf32, #blocked>>
 // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"}
 %15 = tt.load %14 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<8x128xf32, #blocked>>
+// CHECK: tt.store {{.*}} {boundaryCheck = array<i32: 1>, ttig.block_io = "row_major"}
+tt.store %14, %15 {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<8x128xf32, #blocked>>
 scf.yield %12 : !tt.ptr<tensor<8x128xf32, #blocked>>
 }
 tt.return

third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp

Lines changed: 45 additions & 34 deletions
@@ -7,6 +7,7 @@
 #include "mlir/IR/Visitors.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include <optional>

@@ -35,6 +36,18 @@ struct TritonIntelGPUMaterializeBlockPointerPass
       TritonIntelGPUMaterializeBlockPointerPass>::
       TritonIntelGPUMaterializeBlockPointerBase;

+  static Value getPointerFromOp(Operation *op) {
+    return TypeSwitch<Operation *, Value>(op)
+        .Case<tt::LoadOp, tt::StoreOp>([](auto op) { return op.getPtr(); })
+        .Default([&](auto) {
+          llvm_unreachable(
+              +("Invalid operation: " + op->getName().getStringRef())
+                   .str()
+                   .c_str());
+          return Value{};
+        });
+  }
+
   void runOnOperation() override {
     ModuleOp mod = getOperation();
     if (!mod->hasAttr(
@@ -44,21 +57,21 @@ struct TritonIntelGPUMaterializeBlockPointerPass
     tt::intel::ModuleAxisInfoAnalysis axisInfoAnalysis(mod);

     MLIRContext *context = &getContext();
-    mod.walk([&](tt::LoadOp loadOp) {
-      LDBG("Considering op: " << loadOp);
+    mod.walk([&](Operation *op) {
+      if (!isa<tt::LoadOp, tt::StoreOp>(op)) {
+        return;
+      }
+      LDBG("Considering op: " << *op);

-      Value ptr = loadOp.getPtr();
+      Value ptr = getPointerFromOp(op);
       if (!tt::isTensorPointerType(ptr.getType()))
-        return MaterializeTensorOfPointers(loadOp, axisInfoAnalysis);
-
-      assert(isa<RankedTensorType>(loadOp.getResult().getType()) &&
-             "Expected 'loadOp' to load a tensor value.");
+        return MaterializeTensorOfPointers(op, axisInfoAnalysis);

       // Find the make tensor ptr operation that created the base ptr.
       std::optional<tt::MakeTensorPtrOp> defOp =
           tt::intel::findDefiningMakeTensorPtrOp(ptr);
       if (!defOp) {
-        LDBG("Could not find make tensor ptr op for: " << loadOp);
+        LDBG("Could not find make tensor ptr op for: " << *op);
         return;
       }

@@ -71,8 +84,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass
      if (rank == 1)
        return;

-      if (!satisfies2DBlockReadAlignment(loadOp, axisInfoAnalysis)) {
-        LDBG("Alignment checks failed for: " << loadOp);
+      if (!satisfies2DBlockReadAlignment(op, axisInfoAnalysis)) {
+        LDBG("Alignment checks failed for: " << *op);
        return;
      }

@@ -107,8 +120,7 @@ struct TritonIntelGPUMaterializeBlockPointerPass
        return;

      const bool isRowMajor = (strideOneDimVal == rank - 1);
-      std::optional<ttg::DotOperandEncodingAttr> dotLayout =
-          getDotLayout(loadOp);
+      std::optional<ttg::DotOperandEncodingAttr> dotLayout = getDotLayout(op);
      if (dotLayout) {
        // Check if the load is being used by a tt.dot operation, and if so is
        // this the first operand and is it a transposed row major matrix. If
@@ -127,31 +139,33 @@ struct TritonIntelGPUMaterializeBlockPointerPass
        }
      }

-      loadOp->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(),
-                      StringAttr::get(context, isRowMajor ? "row_major"
-                                                          : "column_major"));
+      op->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(),
+                  StringAttr::get(context,
+                                  isRowMajor ? "row_major" : "column_major"));
      }
    });
  }

 private:
  void MaterializeTensorOfPointers(
-      tt::LoadOp loadOp,
+      Operation *op,
      tt::intel::ModuleAxisInfoAnalysis &axisInfoAnalysis) const {
-    MLIRContext *context = loadOp.getContext();
-    Value ptr = loadOp.getPtr();
+    MLIRContext *context = op->getContext();
+    Value ptr = getPointerFromOp(op);
    assert(!tt::isTensorPointerType(ptr.getType()) &&
-           "Expected 'loadOp' to load a tensor value.");
+           "Expected pointer refer to a tensor.");

    auto tensorTy = dyn_cast<RankedTensorType>(ptr.getType());
    if (!tensorTy)
      return;

-    LDBG("Considering tensor of pointer load op: " << loadOp);
+    LDBG("Considering tensor of pointer of memory accessing op: " << *op);

-    if (loadOp.getMask()) {
-      LDBG("Load op has mask, skip block IO attribute");
-      return;
+    if (auto loadOp = dyn_cast<tt::LoadOp>(*op)) {
+      if (loadOp.getMask()) {
+        LDBG("Load op has mask, skip block IO attribute");
+        return;
+      }
    }

    // The axis info gives the information about the value of the indices
@@ -215,8 +229,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass
    // Check if loadOp is row major, i.e., fast changing dimension is one.
    if (isMajor(1 /*fastChangeDim*/)) {
      LDBG("Setting row_major attribute\n");
-      loadOp->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(),
-                      StringAttr::get(context, "row_major"));
+      op->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(),
+                  StringAttr::get(context, "row_major"));
    }

    // TODO: set column_major attribute
@@ -225,9 +239,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass
  // Return the load layout if it is a dot layout. If it is not, check if the
  // load result is converted to a dot layout. If so, return the dot layout,
  // otherwise return nullopt.
-  std::optional<ttg::DotOperandEncodingAttr>
-  getDotLayout(tt::LoadOp loadOp) const {
-    Value ptr = loadOp.getPtr();
+  std::optional<ttg::DotOperandEncodingAttr> getDotLayout(Operation *op) const {
+    Value ptr = getPointerFromOp(op);
    if (!tt::isTensorPointerType(ptr.getType()))
      return std::nullopt;

@@ -254,7 +267,7 @@ struct TritonIntelGPUMaterializeBlockPointerPass
      });
    };

-    Operation::user_range users = loadOp->getUsers();
+    Operation::user_range users = op->getUsers();
    if (!users.empty() && allUsersAreConvertOps(users) &&
        allUserHaveIdenticalLayout(users)) {
      Attribute firstUserLayout =
@@ -282,13 +295,11 @@ struct TritonIntelGPUMaterializeBlockPointerPass
  }

  bool satisfies2DBlockReadAlignment(
-      tt::LoadOp loadOp,
+      Operation *op,
      tt::intel::ModuleAxisInfoAnalysis &axisInfoAnalysis) const {
-    Value ptr = loadOp.getPtr();
+    Value ptr = getPointerFromOp(op);
    assert(tt::isTensorPointerType(ptr.getType()) &&
           "Expected a ptr to a tensor of ptrs.");
-    assert(isa<RankedTensorType>(loadOp.getResult().getType()) &&
-           "Expected 'loadOp' to load a ranked tensor value.");

    // Find the make tensor ptr operation that created the base ptr for the load
    // operation.
@@ -350,7 +361,7 @@ struct TritonIntelGPUMaterializeBlockPointerPass
    }
    LDBG("offset: " << offset);

-    Region *loadRgn = loadOp->getParentRegion();
+    Region *loadRgn = op->getParentRegion();
    Region *makeTensorPtrRgn = makeTensorPtrOp->getParentRegion();
    bool inSameRegion = (loadRgn == makeTensorPtrRgn);
    if (inSameRegion)
