From 2ea0f34498afa02d18b32e5d6b27c635cad881b6 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Thu, 3 Oct 2024 15:33:43 +0000
Subject: [PATCH 01/19] Use block load attribute to remove duplicate logic from
 MaterializeBlockPointer pass

---
 .../RewriteTensorPointer.cpp                  | 82 ++++++-------------
 1 file changed, 26 insertions(+), 56 deletions(-)
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index 8019823203..68ee43e647 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -33,7 +33,7 @@ namespace {
 ///   - it does not have Dpas layout or Dot layout (with Dpas layout as parent)
 ///   - its pitch is not divisible by Qword bitwidth
 ///   - it is not contiguous in memory
-bool shouldRemove(tt::MakeTensorPtrOp &op, bool isUsedByLoadOrStoreOp) {
+bool shouldRemove(tt::MakeTensorPtrOp &op, const bool isUsedByLoadOrStoreOp) {
   LDBG("Considering removal of: " << op);
   if (!op->getParentOfType<ModuleOp>()->hasAttr(
           ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName())) {
@@ -52,55 +52,7 @@ bool shouldRemove(tt::MakeTensorPtrOp &op, bool isUsedByLoadOrStoreOp) {
          "by load or store op with DPAS layout");
     return true;
   }
-
-  TypedValue<triton::PointerType> base = op.getBase();
-  Operation::operand_range shape = op.getShape();
-  unsigned rank = shape.size();
-  assert(rank > 1 && "Expecting tensor with rank > 1");
-  Operation::operand_range strides = op.getStrides();
-  Operation::operand_range offsets = op.getOffsets();
-  ArrayRef<int32_t> order = op.getOrder();
-  ArrayRef<int64_t> tensorShape = tensorType.getShape();
-
-  int fastChangeDim = -1;
-  for (size_t i = 0; i < strides.size(); ++i) {
-    if (ttgi::isConstant(strides[i], 1)) {
-      fastChangeDim = i;
-      break;
-    }
-  }
-
-  LDBG("fastChangeDim: " << fastChangeDim);
-  if (fastChangeDim < 0) {
-    LDBG("Marked for removal: fast changing dimension not found");
-    return true;
-  }
-
-  LDBG("Tensor type element type bit width: "
-       << tensorType.getElementTypeBitWidth());
-  if (fastChangeDim == rank - 2 && tensorType.getElementTypeBitWidth() == 8) {
-    // TODO: column major layout w/ fp8 has performance regression
-    LDBG("Marked for removal: column major layout with fp8 element type");
-    return true;
-  }
-
-  // HW 2D block read instruction has restriction on pitch divisibility
-  if (fastChangeDim >= (rank - 2)) {
-    auto pitch = strides[(fastChangeDim == rank - 1) ? rank - 2 : rank - 1];
-    LDBG("Pitch: " << pitch);
-    // Across Intel platforms, the strictest pitch restriction is to be a
-    // multiple of OWord(128 bits).
-    if (!ttgi::isDivisible(pitch, 128 / tensorType.getElementTypeBitWidth())) {
-      LDBG("Marked for removal: cannot use block read/write instructions");
-      return true;
-    }
-
-    return false;
-  }
-
-  LDBG("Marked for removal: fall-trough");
-
-  return true;
+  return false;
 }
 
 /// The `RewritedInfo` struct is used to store information about a rewritten
@@ -715,10 +667,19 @@ class TritonIntelGPURewriteTensorPointerPass
   void runOnOperation() override {
     ModuleOp mod = getOperation();
 
-    auto usedByLoadOrStoreOp = [](Value val) {
-      return llvm::any_of(val.getUsers(), [](Operation *user) {
-        return isa<tt::LoadOp, tt::StoreOp>(user);
-      });
+    // TODO: do we need this attribute?
+    auto usedByLoadOrStoreOp = [](Value val,
+                                  const bool check_block_io_attribute = false) {
+      return llvm::any_of(
+          val.getUsers(), [check_block_io_attribute](Operation *user) {
+            const bool is_load_or_store = isa<tt::LoadOp, tt::StoreOp>(user);
+            if (check_block_io_attribute) {
+              return user->hasAttr(
+                  ttgi::TritonIntelGPUDialect::getBlockIOAttrName());
+            } else {
+              return is_load_or_store;
+            }
+          });
     };
 
     auto markTensorPointerForRemoval =
@@ -738,8 +699,17 @@ class TritonIntelGPURewriteTensorPointerPass
         markTensorPointerForRemoval(op->getOperand(0),
                                     isa<tt::LoadOp, tt::StoreOp>(op));
       } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
-        for (auto arg : forOp.getInitArgs())
-          markTensorPointerForRemoval(arg);
+        for (auto [arg, blockArg] :
+             llvm::zip(forOp.getInitArgs(),
+                       forOp.getBody()->getArguments().drop_front(
+                           forOp.getNumInductionVars()))) {
+          if (isa<tt::MakeTensorPtrOp>(arg.getDefiningOp())) {
+            constexpr bool check_block_io_attribute = true;
+            markTensorPointerForRemoval(
+                arg.getDefiningOp()->getResult(0),
+                usedByLoadOrStoreOp(blockArg, check_block_io_attribute));
+          }
+        }
       } else if (auto yieldOp = dyn_cast<scf::YieldOp>(op)) {
         for (auto operand : yieldOp.getOperands())
           markTensorPointerForRemoval(operand);

From 6b2b526e7d69acaf3a93fa74361dff39c6fc44e8 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Fri, 4 Oct 2024 20:36:27 +0000
Subject: [PATCH 02/19] broken: need to remove every tensor ptr type in the
 chain

---
 .../RewriteTensorPointer.cpp                  | 101 +++++++++++++++---
 1 file changed, 88 insertions(+), 13 deletions(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index 68ee43e647..c73af83ef3 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -33,7 +33,8 @@ namespace {
 ///   - it does not have Dpas layout or Dot layout (with Dpas layout as parent)
 ///   - its pitch is not divisible by Qword bitwidth
 ///   - it is not contiguous in memory
-bool shouldRemove(tt::MakeTensorPtrOp &op, const bool isUsedByLoadOrStoreOp) {
+bool shouldRemove(tt::MakeTensorPtrOp &op, const bool isUsedByLoadOrStoreOp,
+                  const bool isUsedByBlockLoadOrStoreOp) {
   LDBG("Considering removal of: " << op);
   if (!op->getParentOfType<ModuleOp>()->hasAttr(
           ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName())) {
@@ -41,18 +42,27 @@ bool shouldRemove(tt::MakeTensorPtrOp &op, const bool isUsedByLoadOrStoreOp) {
     return true;
   }
 
+  if (isUsedByBlockLoadOrStoreOp) {
+    LDBG("Used by block load/store, skipping removal");
+    return false;
+  }
+
   auto ptrType = cast<tt::PointerType>(op.getType());
   LDBG("Op ptr type: " << ptrType);
   auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
   LDBG("Op tensor type: " << tensorType);
+  LDBG("Used by load or store op? " << isUsedByLoadOrStoreOp);
 
-  if (!ttgi::hasDotDpasEncoding(tensorType) &&
-      !(isUsedByLoadOrStoreOp && ttgi::hasDpasEncoding(tensorType))) {
-    LDBG("Marked for removal: tensor doesn't have DPAS layout and is not used "
-         "by load or store op with DPAS layout");
-    return true;
+  if (ttgi::hasDotDpasEncoding(tensorType) &&
+      (isUsedByLoadOrStoreOp && ttgi::hasDpasEncoding(tensorType))) {
+    LDBG("Tensor with DPAS layout is used by load/store op with DPAS layout, "
+         "skipping removal");
+    return false;
   }
-  return false;
+
+  LDBG("Marked for removal: tensor doesn't have DPAS layout and is not used "
+       "by load or store op with DPAS layout");
+  return true;
 }
 
 /// The `RewritedInfo` struct is used to store information about a rewritten
@@ -683,30 +693,93 @@ class TritonIntelGPURewriteTensorPointerPass
     };
 
     auto markTensorPointerForRemoval =
-        [this](Value val, bool isUsedByLoadOrStoreOp = false) {
+        [this](Value val, bool isUsedByLoadOrStoreOp = false,
+               bool isUsedByBlockLoadOrStoreOp = false) {
           if (tt::isTensorPointerType(val.getType())) {
             tt::MakeTensorPtrOp makeTensorPtrOp = getMakeTensorPtrOp(val);
-            if (shouldRemove(makeTensorPtrOp, isUsedByLoadOrStoreOp))
+            if (shouldRemove(makeTensorPtrOp, isUsedByLoadOrStoreOp,
+                             isUsedByBlockLoadOrStoreOp)) {
               valueToRemove.insert(val);
+            }
           }
         };
 
     mod.walk([&](Operation *op) {
       if (isa<tt::MakeTensorPtrOp>(op)) {
+        DenseSet<Operation *> workingSet;
+
+        auto makeTensorPtrOp = dyn_cast<tt::MakeTensorPtrOp>(op);
+        LDBG("Considering: " << *op);
         Value result = op->getResult(0);
-        markTensorPointerForRemoval(result, usedByLoadOrStoreOp(result));
+        for (auto user : result.getUsers()) {
+          workingSet.insert(user); // TODO: safe? need to check ptr?
+        }
+        while (!workingSet.empty()) {
+          for (auto v : workingSet) {
+            LDBG("Working set val: " << *v);
+          }
+          auto crtOpItr = workingSet.begin();
+          auto crtOp = *crtOpItr;
+          LDBG("Processing op: " << *crtOp);
+          if (isa<tt::LoadOp, tt::StoreOp>(crtOp)) {
+            LDBG("is load store, should remove?");
+            if (shouldRemove(
+                    makeTensorPtrOp, /*isUsedByLoadOrStoreOp=*/true,
+                    /*isBlockLoadOrStore=*/
+                    crtOp->hasAttr(
+                        ttgi::TritonIntelGPUDialect::getBlockIOAttrName()))) {
+              LDBG("Removing: " << result);
+              valueToRemove.insert(result);
+            }
+          } else if (auto forOp = dyn_cast<scf::ForOp>(crtOp)) {
+            for (auto [arg, blockArg] :
+                 llvm::zip(forOp.getInitArgs(),
+                           forOp.getBody()->getArguments().drop_front(
+                               forOp.getNumInductionVars()))) {
+              if (arg == makeTensorPtrOp) {
+                // add users of block arg
+                for (auto user : blockArg.getUsers()) {
+                  workingSet.insert(user);
+                }
+              }
+            }
+#if 0
+          } else if (auto yieldOp = dyn_cast<scf::YieldOp>(op)) {
+            for (auto operand : yieldOp.getOperands()) {
+              workingSet.insert(operand->getResult(0));
+            }
+#endif
+          } else if (crtOp->getNumResults() > 0) {
+            // TODO: handle more than one result?
+            auto crtOpResult = crtOp->getResult(0);
+            LDBG("Not a load store and not a loop, adding users to working "
+                 "set.");
+            for (auto user : crtOpResult.getUsers()) {
+              workingSet.insert(user);
+            }
+          }
+          workingSet.erase(crtOpItr);
+        }
+#if 1
+      }
+#else
       } else if (isa<tt::AdvanceOp, tt::LoadOp, tt::StoreOp>(op)) {
-        markTensorPointerForRemoval(op->getOperand(0),
-                                    isa<tt::LoadOp, tt::StoreOp>(op));
+        const bool isLoadStoreOp = isa<tt::LoadOp, tt::StoreOp>(op);
+        markTensorPointerForRemoval(
+            op->getOperand(0), isLoadStoreOp,
+            isLoadStoreOp &&
+                op->hasAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName()));
       } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
         for (auto [arg, blockArg] :
              llvm::zip(forOp.getInitArgs(),
                        forOp.getBody()->getArguments().drop_front(
                            forOp.getNumInductionVars()))) {
+          LDBG("arg: " << arg);
           if (isa<tt::MakeTensorPtrOp>(arg.getDefiningOp())) {
             constexpr bool check_block_io_attribute = true;
             markTensorPointerForRemoval(
                 arg.getDefiningOp()->getResult(0),
+                usedByLoadOrStoreOp(blockArg),
                 usedByLoadOrStoreOp(blockArg, check_block_io_attribute));
           }
         }
@@ -714,6 +787,7 @@ class TritonIntelGPURewriteTensorPointerPass
         for (auto operand : yieldOp.getOperands())
           markTensorPointerForRemoval(operand);
       }
+#endif
     });
 
     LLVM_DEBUG({
@@ -722,7 +796,7 @@ class TritonIntelGPURewriteTensorPointerPass
       else {
         DBGS() << "Values to remove: ";
         for (auto val : valueToRemove)
-          DBGS() << val;
+          DBGS() << val << "\n";
       }
     });
 
@@ -746,6 +820,7 @@ class TritonIntelGPURewriteTensorPointerPass
     valueToRemove.clear();
     while (!eraser.empty()) {
       auto op = eraser.top();
+      LDBG("DELETING " << *op);
       eraser.pop();
       op->erase();
     }

From 05cd65ef72ecd92a458db3da4526ea9416ef4442 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Sat, 5 Oct 2024 02:03:43 +0000
Subject: [PATCH 03/19] fixup case 0/1, working on case 2

---
 .../RewriteTensorPointer.cpp                  | 67 ++++++++++++-------
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index c73af83ef3..de3d8e8c9d 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -33,8 +33,8 @@ namespace {
 ///   - it does not have Dpas layout or Dot layout (with Dpas layout as parent)
 ///   - its pitch is not divisible by Qword bitwidth
 ///   - it is not contiguous in memory
-bool shouldRemove(tt::MakeTensorPtrOp &op, const bool isUsedByLoadOrStoreOp,
-                  const bool isUsedByBlockLoadOrStoreOp) {
+bool shouldRemove(tt::MakeTensorPtrOp &op, const bool isUsedByStoreOp,
+                  const bool isUsedByBlockLoadOp) {
   LDBG("Considering removal of: " << op);
   if (!op->getParentOfType<ModuleOp>()->hasAttr(
           ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName())) {
@@ -42,20 +42,17 @@ bool shouldRemove(tt::MakeTensorPtrOp &op, const bool isUsedByLoadOrStoreOp,
     return true;
   }
 
-  if (isUsedByBlockLoadOrStoreOp) {
-    LDBG("Used by block load/store, skipping removal");
-    return false;
-  }
-
   auto ptrType = cast<tt::PointerType>(op.getType());
   LDBG("Op ptr type: " << ptrType);
   auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
   LDBG("Op tensor type: " << tensorType);
-  LDBG("Used by load or store op? " << isUsedByLoadOrStoreOp);
+  LDBG("Used by store op? " << isUsedByStoreOp);
+  LDBG("Used by block load op? " << isUsedByBlockLoadOp);
 
-  if (ttgi::hasDotDpasEncoding(tensorType) &&
-      (isUsedByLoadOrStoreOp && ttgi::hasDpasEncoding(tensorType))) {
-    LDBG("Tensor with DPAS layout is used by load/store op with DPAS layout, "
+  LDBG("hasDotDpasEncoding: " << ttgi::hasDotDpasEncoding(tensorType));
+  LDBG("hasDpasEncoding: " << ttgi::hasDpasEncoding(tensorType));
+  if (ttgi::hasDotDpasEncoding(tensorType) || isUsedByBlockLoadOp || (isUsedByStoreOp && ttgi::hasDpasEncoding(tensorType))) {
+    LDBG("Tensor has DPAS layout or is used by load/store op with DPAS layout, "
          "skipping removal");
     return false;
   }
@@ -692,18 +689,9 @@ class TritonIntelGPURewriteTensorPointerPass
           });
     };
 
-    auto markTensorPointerForRemoval =
-        [this](Value val, bool isUsedByLoadOrStoreOp = false,
-               bool isUsedByBlockLoadOrStoreOp = false) {
-          if (tt::isTensorPointerType(val.getType())) {
-            tt::MakeTensorPtrOp makeTensorPtrOp = getMakeTensorPtrOp(val);
-            if (shouldRemove(makeTensorPtrOp, isUsedByLoadOrStoreOp,
-                             isUsedByBlockLoadOrStoreOp)) {
-              valueToRemove.insert(val);
-            }
-          }
-        };
 
+    // TODO: this is working, but materialize block pointer needs to 
+    DenseSet<Operation *> tensorPointersToRemove;
     mod.walk([&](Operation *op) {
       if (isa<tt::MakeTensorPtrOp>(op)) {
         DenseSet<Operation *> workingSet;
@@ -724,12 +712,12 @@ class TritonIntelGPURewriteTensorPointerPass
           if (isa<tt::LoadOp, tt::StoreOp>(crtOp)) {
             LDBG("is load store, should remove?");
             if (shouldRemove(
-                    makeTensorPtrOp, /*isUsedByLoadOrStoreOp=*/true,
-                    /*isBlockLoadOrStore=*/
-                    crtOp->hasAttr(
+                    makeTensorPtrOp, /*isUsedByStoreOp=*/isa<tt::StoreOp>(crtOp),
+                    /*isBlockLoad=*/
+                    isa<tt::LoadOp>(crtOp) && crtOp->hasAttr(
                         ttgi::TritonIntelGPUDialect::getBlockIOAttrName()))) {
               LDBG("Removing: " << result);
-              valueToRemove.insert(result);
+              tensorPointersToRemove.insert(makeTensorPtrOp);
             }
           } else if (auto forOp = dyn_cast<scf::ForOp>(crtOp)) {
             for (auto [arg, blockArg] :
@@ -790,6 +778,33 @@ class TritonIntelGPURewriteTensorPointerPass
 #endif
     });
 
+    auto markTensorPointerForRemoval =
+        [this, &tensorPointersToRemove](Value val, bool isUsedByLoadOrStoreOp = false,
+               bool isUsedByBlockLoadOrStoreOp = false) {
+          if (tt::isTensorPointerType(val.getType())) {
+            tt::MakeTensorPtrOp makeTensorPtrOp = getMakeTensorPtrOp(val);
+            if (tensorPointersToRemove.count(makeTensorPtrOp)) {
+              valueToRemove.insert(val);
+            }
+          }
+        };
+
+    mod.walk([&](Operation *op) {
+      if (isa<tt::MakeTensorPtrOp>(op)) {
+        Value result = op->getResult(0);
+        markTensorPointerForRemoval(result, usedByLoadOrStoreOp(result));
+      } else if (isa<tt::AdvanceOp, tt::LoadOp, tt::StoreOp>(op)) {
+        markTensorPointerForRemoval(op->getOperand(0),
+                                    isa<tt::LoadOp, tt::StoreOp>(op));
+      } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
+        for (auto arg : forOp.getInitArgs())
+          markTensorPointerForRemoval(arg);
+      } else if (auto yieldOp = dyn_cast<scf::YieldOp>(op)) {
+        for (auto operand : yieldOp.getOperands())
+          markTensorPointerForRemoval(operand);
+      }
+    });
+
     LLVM_DEBUG({
       if (valueToRemove.empty())
         DBGS() << "No tensor pointer to remove";

From 7bd98ee93ef0fee736fef2faa23911c074dad70b Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Mon, 7 Oct 2024 12:47:00 +0000
Subject: [PATCH 04/19] modify rewrit tensor pointer test to rely on block io
 tag from materialize block ptr

---
 .../rewrite-tensor-pointer.mlir               | 79 ++-----------------
 1 file changed, 6 insertions(+), 73 deletions(-)

diff --git a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
index 761c827172..c88f0f900c 100644
--- a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
+++ b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
@@ -44,10 +44,10 @@ module attributes {"triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-wa
     // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
     %22 = tt.make_tensor_ptr %arg1, [%16, %20], [%21, %c1_i64], [%c0_i32, %19] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #dot1>>
     %23:3 = scf.for %arg10 = %c0_i32 to %arg6 step %c32_i32 iter_args(%arg11 = %cst, %arg12 = %18, %arg13 = %22) -> (tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #dot0>>, !tt.ptr<tensor<32x256xf16, #dot1>>)  : i32 {
-      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
-      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
-      %28 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xf16, #dot0>>
-      %29 = tt.load %arg13 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #dot1>>
+      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+      %28 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<256x32xf16, #dot0>>
+      %29 = tt.load %arg13 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<32x256xf16, #dot1>>
       // CHECK:  tt.dot {{.*}}, {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>> * tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>> -> tensor<256x256xf32, #[[DPAS]]>
       // CHECK:  tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
       // CHECK:  tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
@@ -59,8 +59,8 @@ module attributes {"triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-wa
     %25 = arith.extsi %arg9 : i32 to i64
     // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<256x256xf32, #[[DPAS]]>>
     %26 = tt.make_tensor_ptr %arg3, [%15, %20], [%25, %c1_i64], [%14, %19] {order = array<i32: 1, 0>} : <tensor<256x256xf32, #dpas>>
-    // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x256xf32, #[[DPAS]]>>
-    %27 = tt.load %26 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x256xf32, #dpas>>
+    // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<256x256xf32, #[[DPAS]]>>
+    %27 = tt.load %26 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<256x256xf32, #dpas>>
     %28 = arith.addf %23#0, %27 : tensor<256x256xf32, #dpas>
     %29 = arith.truncf %28 : tensor<256x256xf32, #dpas> to tensor<256x256xf16, #dpas>
 
@@ -150,73 +150,6 @@ module attributes {"triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-wa
 
 // -----
 
-// COM: Case 2:
-// COM: Check that operations using block pointers without divisibility attribute are rewritten to use a legacy pointer.
-// CHECK: #[[DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
-#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 16], order = [1, 0]}>
-#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
-#dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
-#dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
-module attributes {"triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
-  tt.func public @matmul_kernel_with_block_pointers_indivisible(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32, %arg7: i32, %arg8: i32 {tt.divisibility = 16 : i32}) {
-    // CHECK:  @matmul_kernel_with_block_pointers_indivisible
-    %c4_i32 = arith.constant 4 : i32
-    %c256_i32 = arith.constant 256 : i32
-    %c1_i64 = arith.constant 1 : i64
-    %c0_i32 = arith.constant 0 : i32
-    %c32_i32 = arith.constant 32 : i32
-    %c255_i32 = arith.constant 255 : i32
-    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #dpas>
-    %0 = tt.get_program_id x : i32
-    %1 = arith.addi %arg3, %c255_i32 : i32
-    %2 = arith.divsi %1, %c256_i32 : i32
-    %3 = arith.addi %arg4, %c255_i32 : i32
-    %4 = arith.divsi %3, %c256_i32 : i32
-    %5 = arith.muli %4, %c4_i32 : i32
-    %6 = arith.divsi %0, %5 : i32
-    %7 = arith.muli %6, %c4_i32 : i32
-    %8 = arith.subi %2, %7 : i32
-    %9 = arith.minsi %8, %c4_i32 : i32
-    %10 = arith.remsi %0, %9 : i32
-    %11 = arith.addi %7, %10 : i32
-    %12 = arith.remsi %0, %5 : i32
-    %13 = arith.divsi %12, %9 : i32
-    %14 = arith.muli %11, %c256_i32 : i32
-    %15 = arith.extsi %arg3 : i32 to i64
-    %16 = arith.extsi %arg5 : i32 to i64
-    %17 = arith.extsi %arg6 : i32 to i64
-    // CHECK-NOT: tt.make_tensor_ptr
-    %18 = tt.make_tensor_ptr %arg0, [%15, %16], [%17, %c1_i64], [%14, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #dot0>>
-    %19 = arith.muli %13, %c256_i32 : i32
-    %20 = arith.extsi %arg4 : i32 to i64
-    %21 = arith.extsi %arg7 : i32 to i64
-    // CHECK-NOT: tt.make_tensor_ptr
-    %22 = tt.make_tensor_ptr %arg1, [%16, %20], [%21, %c1_i64], [%c0_i32, %19] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #dot1>>
-    %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #dot0>>, !tt.ptr<tensor<32x256xf16, #dot1>>) : i32 {
-      // CHECK: tt.load {{.*}}, {{.*}} : tensor<256x32x!tt.ptr<f16>, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>
-      // CHECK: tt.load {{.*}}, {{.*}} : tensor<32x256x!tt.ptr<f16>, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>
-      %28 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xf16, #dot0>>
-      %29 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #dot1>>
-      %30 = tt.dot %28, %29, %arg10, inputPrecision = tf32 : tensor<256x32xf16, #dot0> * tensor<32x256xf16, #dot1> -> tensor<256x256xf32, #dpas>
-      // CHECK-NOT: tt.advance
-      %31 = tt.advance %arg11, [%c0_i32, %c32_i32] : <tensor<256x32xf16, #dot0>>
-      // CHECK-NOT: tt.advance
-      %32 = tt.advance %arg12, [%c32_i32, %c0_i32] : <tensor<32x256xf16, #dot1>>
-      scf.yield %30, %31, %32 : tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #dot0>>, !tt.ptr<tensor<32x256xf16, #dot1>>
-    }
-    %24 = arith.truncf %23#0 : tensor<256x256xf32, #dpas> to tensor<256x256xf16, #dpas>
-    %25 = triton_gpu.convert_layout %24 : tensor<256x256xf16, #dpas> -> tensor<256x256xf16, #blocked>
-    %26 = arith.extsi %arg8 : i32 to i64
-    // CHECK-NOT: tt.make_tensor_ptr
-    %27 = tt.make_tensor_ptr %arg2, [%15, %20], [%26, %c1_i64], [%14, %19] {order = array<i32: 1, 0>} : <tensor<256x256xf16, #blocked>>
-    // CHECK: tt.store {{.*}}, {{.*}}, {{.*}} : tensor<256x256x!tt.ptr<f16>, #[[BLOCKED]]>
-    tt.store %27, %25 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x256xf16, #blocked>>
-    tt.return
-  }
-}
-
-// -----
-
 // COM: Case 3:
 // COM: Check that operations using block pointers without a layout attribute are rewritten to use a legacy pointer.
 module attributes {"triton_intel_gpu.support_sg_2d_block"} {

From 3df402f27efbff773349d6069b0fb7859461fda4 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 8 Oct 2024 01:27:04 +0000
Subject: [PATCH 05/19] move fp8 regression fix to materialize block ptr

---
 .../lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp | 5 +++++
 .../lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp    | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp
index 8361675b55..a9a43989f4 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp
@@ -71,6 +71,11 @@ struct TritonIntelGPUMaterializeBlockPointerPass
         return;
       }
 
+      if (fastChangeDim == rank - 2 && tensorType.getElementTypeBitWidth() == 8) {
+        // TODO: column major layout w/ fp8 has performance regression
+        return;
+      }
+
       if (fastChangeDim >= (rank - 2)) {
         // HW 2D block read instruction only supports contiguous access.
         Value fastChangeStride = strides[fastChangeDim];
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index de3d8e8c9d..f704e446a6 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -51,7 +51,7 @@ bool shouldRemove(tt::MakeTensorPtrOp &op, const bool isUsedByStoreOp,
 
   LDBG("hasDotDpasEncoding: " << ttgi::hasDotDpasEncoding(tensorType));
   LDBG("hasDpasEncoding: " << ttgi::hasDpasEncoding(tensorType));
-  if (ttgi::hasDotDpasEncoding(tensorType) || isUsedByBlockLoadOp || (isUsedByStoreOp && ttgi::hasDpasEncoding(tensorType))) {
+  if (/*ttgi::hasDotDpasEncoding(tensorType) ||*/ isUsedByBlockLoadOp || (isUsedByStoreOp && ttgi::hasDpasEncoding(tensorType))) {
     LDBG("Tensor has DPAS layout or is used by load/store op with DPAS layout, "
          "skipping removal");
     return false;

From 57d39cefb966bf61b4970534dc1611d0e038cf2c Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 8 Oct 2024 01:37:24 +0000
Subject: [PATCH 06/19] format and cleanups

---
 .../MaterializeBlockPointer.cpp               |  3 ++-
 .../RewriteTensorPointer.cpp                  | 26 +++++++++----------
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp
index a9a43989f4..9a0b5e4f9d 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp
@@ -71,7 +71,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass
         return;
       }
 
-      if (fastChangeDim == rank - 2 && tensorType.getElementTypeBitWidth() == 8) {
+      if (fastChangeDim == rank - 2 &&
+          tensorType.getElementTypeBitWidth() == 8) {
         // TODO: column major layout w/ fp8 has performance regression
         return;
       }
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index f704e446a6..dba4a76d0a 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -49,16 +49,16 @@ bool shouldRemove(tt::MakeTensorPtrOp &op, const bool isUsedByStoreOp,
   LDBG("Used by store op? " << isUsedByStoreOp);
   LDBG("Used by block load op? " << isUsedByBlockLoadOp);
 
-  LDBG("hasDotDpasEncoding: " << ttgi::hasDotDpasEncoding(tensorType));
   LDBG("hasDpasEncoding: " << ttgi::hasDpasEncoding(tensorType));
-  if (/*ttgi::hasDotDpasEncoding(tensorType) ||*/ isUsedByBlockLoadOp || (isUsedByStoreOp && ttgi::hasDpasEncoding(tensorType))) {
+  if (isUsedByBlockLoadOp ||
+      (isUsedByStoreOp && ttgi::hasDpasEncoding(tensorType))) {
     LDBG("Tensor has DPAS layout or is used by load/store op with DPAS layout, "
          "skipping removal");
     return false;
   }
 
-  LDBG("Marked for removal: tensor doesn't have DPAS layout and is not used "
-       "by load or store op with DPAS layout");
+  LDBG("Marked for removal: make tensor ptr op is not used by block load op or "
+       "by store op with DPAS layout");
   return true;
 }
 
@@ -689,8 +689,6 @@ class TritonIntelGPURewriteTensorPointerPass
           });
     };
 
-
-    // TODO: this is working, but materialize block pointer needs to 
     DenseSet<Operation *> tensorPointersToRemove;
     mod.walk([&](Operation *op) {
       if (isa<tt::MakeTensorPtrOp>(op)) {
@@ -711,11 +709,12 @@ class TritonIntelGPURewriteTensorPointerPass
           LDBG("Processing op: " << *crtOp);
           if (isa<tt::LoadOp, tt::StoreOp>(crtOp)) {
             LDBG("is load store, should remove?");
-            if (shouldRemove(
-                    makeTensorPtrOp, /*isUsedByStoreOp=*/isa<tt::StoreOp>(crtOp),
-                    /*isBlockLoad=*/
-                    isa<tt::LoadOp>(crtOp) && crtOp->hasAttr(
-                        ttgi::TritonIntelGPUDialect::getBlockIOAttrName()))) {
+            if (shouldRemove(makeTensorPtrOp,
+                             /*isUsedByStoreOp=*/isa<tt::StoreOp>(crtOp),
+                             /*isBlockLoad=*/
+                             isa<tt::LoadOp>(crtOp) &&
+                                 crtOp->hasAttr(ttgi::TritonIntelGPUDialect::
+                                                    getBlockIOAttrName()))) {
               LDBG("Removing: " << result);
               tensorPointersToRemove.insert(makeTensorPtrOp);
             }
@@ -779,8 +778,9 @@ class TritonIntelGPURewriteTensorPointerPass
     });
 
     auto markTensorPointerForRemoval =
-        [this, &tensorPointersToRemove](Value val, bool isUsedByLoadOrStoreOp = false,
-               bool isUsedByBlockLoadOrStoreOp = false) {
+        [this,
+         &tensorPointersToRemove](Value val, bool isUsedByLoadOrStoreOp = false,
+                                  bool isUsedByBlockLoadOrStoreOp = false) {
           if (tt::isTensorPointerType(val.getType())) {
             tt::MakeTensorPtrOp makeTensorPtrOp = getMakeTensorPtrOp(val);
             if (tensorPointersToRemove.count(makeTensorPtrOp)) {

From d9b581786e6fcbed5fb62592f402a8f6ece96d13 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 8 Oct 2024 01:41:34 +0000
Subject: [PATCH 07/19] format and cleanup

---
 .../RewriteTensorPointer.cpp                  | 85 ++++---------------
 1 file changed, 16 insertions(+), 69 deletions(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index dba4a76d0a..c726d0fc4b 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -674,21 +674,6 @@ class TritonIntelGPURewriteTensorPointerPass
   void runOnOperation() override {
     ModuleOp mod = getOperation();
 
-    // TODO: do we need this attribute?
-    auto usedByLoadOrStoreOp = [](Value val,
-                                  const bool check_block_io_attribute = false) {
-      return llvm::any_of(
-          val.getUsers(), [check_block_io_attribute](Operation *user) {
-            const bool is_load_or_store = isa<tt::LoadOp, tt::StoreOp>(user);
-            if (check_block_io_attribute) {
-              return user->hasAttr(
-                  ttgi::TritonIntelGPUDialect::getBlockIOAttrName());
-            } else {
-              return is_load_or_store;
-            }
-          });
-    };
-
     DenseSet<Operation *> tensorPointersToRemove;
     mod.walk([&](Operation *op) {
       if (isa<tt::MakeTensorPtrOp>(op)) {
@@ -698,24 +683,22 @@ class TritonIntelGPURewriteTensorPointerPass
         LDBG("Considering: " << *op);
         Value result = op->getResult(0);
         for (auto user : result.getUsers()) {
-          workingSet.insert(user); // TODO: safe? need to check ptr?
+          workingSet.insert(user);
         }
         while (!workingSet.empty()) {
-          for (auto v : workingSet) {
-            LDBG("Working set val: " << *v);
-          }
           auto crtOpItr = workingSet.begin();
           auto crtOp = *crtOpItr;
           LDBG("Processing op: " << *crtOp);
           if (isa<tt::LoadOp, tt::StoreOp>(crtOp)) {
-            LDBG("is load store, should remove?");
+            LDBG("is load store, checking to see if we should remove make "
+                 "tensor ptr op");
             if (shouldRemove(makeTensorPtrOp,
                              /*isUsedByStoreOp=*/isa<tt::StoreOp>(crtOp),
                              /*isBlockLoad=*/
                              isa<tt::LoadOp>(crtOp) &&
                                  crtOp->hasAttr(ttgi::TritonIntelGPUDialect::
                                                     getBlockIOAttrName()))) {
-              LDBG("Removing: " << result);
+              LDBG("Marking op for removal: " << result);
               tensorPointersToRemove.insert(makeTensorPtrOp);
             }
           } else if (auto forOp = dyn_cast<scf::ForOp>(crtOp)) {
@@ -730,14 +713,8 @@ class TritonIntelGPURewriteTensorPointerPass
                 }
               }
             }
-#if 0
-          } else if (auto yieldOp = dyn_cast<scf::YieldOp>(op)) {
-            for (auto operand : yieldOp.getOperands()) {
-              workingSet.insert(operand->getResult(0));
-            }
-#endif
           } else if (crtOp->getNumResults() > 0) {
-            // TODO: handle more than one result?
+            // TODO: should we handle more than one result?
             auto crtOpResult = crtOp->getResult(0);
             LDBG("Not a load store and not a loop, adding users to working "
                  "set.");
@@ -747,55 +724,25 @@ class TritonIntelGPURewriteTensorPointerPass
           }
           workingSet.erase(crtOpItr);
         }
-#if 1
-      }
-#else
-      } else if (isa<tt::AdvanceOp, tt::LoadOp, tt::StoreOp>(op)) {
-        const bool isLoadStoreOp = isa<tt::LoadOp, tt::StoreOp>(op);
-        markTensorPointerForRemoval(
-            op->getOperand(0), isLoadStoreOp,
-            isLoadStoreOp &&
-                op->hasAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName()));
-      } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
-        for (auto [arg, blockArg] :
-             llvm::zip(forOp.getInitArgs(),
-                       forOp.getBody()->getArguments().drop_front(
-                           forOp.getNumInductionVars()))) {
-          LDBG("arg: " << arg);
-          if (isa<tt::MakeTensorPtrOp>(arg.getDefiningOp())) {
-            constexpr bool check_block_io_attribute = true;
-            markTensorPointerForRemoval(
-                arg.getDefiningOp()->getResult(0),
-                usedByLoadOrStoreOp(blockArg),
-                usedByLoadOrStoreOp(blockArg, check_block_io_attribute));
-          }
-        }
-      } else if (auto yieldOp = dyn_cast<scf::YieldOp>(op)) {
-        for (auto operand : yieldOp.getOperands())
-          markTensorPointerForRemoval(operand);
       }
-#endif
     });
 
-    auto markTensorPointerForRemoval =
-        [this,
-         &tensorPointersToRemove](Value val, bool isUsedByLoadOrStoreOp = false,
-                                  bool isUsedByBlockLoadOrStoreOp = false) {
-          if (tt::isTensorPointerType(val.getType())) {
-            tt::MakeTensorPtrOp makeTensorPtrOp = getMakeTensorPtrOp(val);
-            if (tensorPointersToRemove.count(makeTensorPtrOp)) {
-              valueToRemove.insert(val);
-            }
-          }
-        };
+    auto markTensorPointerForRemoval = [this,
+                                        &tensorPointersToRemove](Value val) {
+      if (tt::isTensorPointerType(val.getType())) {
+        tt::MakeTensorPtrOp makeTensorPtrOp = getMakeTensorPtrOp(val);
+        if (tensorPointersToRemove.count(makeTensorPtrOp)) {
+          valueToRemove.insert(val);
+        }
+      }
+    };
 
     mod.walk([&](Operation *op) {
       if (isa<tt::MakeTensorPtrOp>(op)) {
         Value result = op->getResult(0);
-        markTensorPointerForRemoval(result, usedByLoadOrStoreOp(result));
+        markTensorPointerForRemoval(result);
       } else if (isa<tt::AdvanceOp, tt::LoadOp, tt::StoreOp>(op)) {
-        markTensorPointerForRemoval(op->getOperand(0),
-                                    isa<tt::LoadOp, tt::StoreOp>(op));
+        markTensorPointerForRemoval(op->getOperand(0));
       } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
         for (auto arg : forOp.getInitArgs())
           markTensorPointerForRemoval(arg);

From d039975ca502da7721eb6b482be2b1f7be52aea5 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 8 Oct 2024 01:43:15 +0000
Subject: [PATCH 08/19] "final" cleanup :)

---
 .../lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp     | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index c726d0fc4b..2b3bfc7723 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -690,15 +690,12 @@ class TritonIntelGPURewriteTensorPointerPass
           auto crtOp = *crtOpItr;
           LDBG("Processing op: " << *crtOp);
           if (isa<tt::LoadOp, tt::StoreOp>(crtOp)) {
-            LDBG("is load store, checking to see if we should remove make "
-                 "tensor ptr op");
             if (shouldRemove(makeTensorPtrOp,
                              /*isUsedByStoreOp=*/isa<tt::StoreOp>(crtOp),
                              /*isBlockLoad=*/
                              isa<tt::LoadOp>(crtOp) &&
                                  crtOp->hasAttr(ttgi::TritonIntelGPUDialect::
                                                     getBlockIOAttrName()))) {
-              LDBG("Marking op for removal: " << result);
               tensorPointersToRemove.insert(makeTensorPtrOp);
             }
           } else if (auto forOp = dyn_cast<scf::ForOp>(crtOp)) {
@@ -782,7 +779,6 @@ class TritonIntelGPURewriteTensorPointerPass
     valueToRemove.clear();
     while (!eraser.empty()) {
       auto op = eraser.top();
-      LDBG("DELETING " << *op);
       eraser.pop();
       op->erase();
     }

From 0a1cf46b9de29d30361f643e3bb83e80aa171965 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 8 Oct 2024 02:35:34 +0000
Subject: [PATCH 09/19] fixup and re-add lit test

---
 .../rewrite-tensor-pointer.mlir               | 75 ++++++++++++++++++-
 1 file changed, 71 insertions(+), 4 deletions(-)

diff --git a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
index c88f0f900c..2e88060686 100644
--- a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
+++ b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
@@ -125,10 +125,10 @@ module attributes {"triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-wa
     // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
     %22 = tt.make_tensor_ptr %arg1, [%16, %20], [%21, %c1_i64], [%c0_i32, %19] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #dot1>>
     %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #dot0>>, !tt.ptr<tensor<32x256xf16, #dot1>>) : i32 {
-      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
-      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
-      %28 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xf16, #dot0>>
-      %29 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #dot1>>
+      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>,  triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+      // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0, 1>,  triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+      %28 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>,  triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<256x32xf16, #dot0>>
+      %29 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>,  triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<32x256xf16, #dot1>>
       // CHECK:  tt.dot {{.*}}, {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>> * tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>> -> tensor<256x256xf32, #[[DPAS]]>
       // CHECK:  tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
       // CHECK:  tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
@@ -150,6 +150,73 @@ module attributes {"triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-wa
 
 // -----
 
+// COM: Case 2:
+// COM: Check that operations using block pointers without divisibility attribute are rewritten to use a legacy pointer.
+// CHECK: #[[DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 16], order = [1, 0]}>
+#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
+#dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
+#dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
+module attributes {"triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
+  tt.func public @matmul_kernel_with_block_pointers_indivisible(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32, %arg7: i32, %arg8: i32 {tt.divisibility = 16 : i32}) {
+    // CHECK:  @matmul_kernel_with_block_pointers_indivisible
+    %c4_i32 = arith.constant 4 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %c1_i64 = arith.constant 1 : i64
+    %c0_i32 = arith.constant 0 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c255_i32 = arith.constant 255 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #dpas>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.addi %arg3, %c255_i32 : i32
+    %2 = arith.divsi %1, %c256_i32 : i32
+    %3 = arith.addi %arg4, %c255_i32 : i32
+    %4 = arith.divsi %3, %c256_i32 : i32
+    %5 = arith.muli %4, %c4_i32 : i32
+    %6 = arith.divsi %0, %5 : i32
+    %7 = arith.muli %6, %c4_i32 : i32
+    %8 = arith.subi %2, %7 : i32
+    %9 = arith.minsi %8, %c4_i32 : i32
+    %10 = arith.remsi %0, %9 : i32
+    %11 = arith.addi %7, %10 : i32
+    %12 = arith.remsi %0, %5 : i32
+    %13 = arith.divsi %12, %9 : i32
+    %14 = arith.muli %11, %c256_i32 : i32
+    %15 = arith.extsi %arg3 : i32 to i64
+    %16 = arith.extsi %arg5 : i32 to i64
+    %17 = arith.extsi %arg6 : i32 to i64
+    // CHECK-NOT: tt.make_tensor_ptr
+    %18 = tt.make_tensor_ptr %arg0, [%15, %16], [%17, %c1_i64], [%14, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #dot0>>
+    %19 = arith.muli %13, %c256_i32 : i32
+    %20 = arith.extsi %arg4 : i32 to i64
+    %21 = arith.extsi %arg7 : i32 to i64
+    // CHECK-NOT: tt.make_tensor_ptr
+    %22 = tt.make_tensor_ptr %arg1, [%16, %20], [%21, %c1_i64], [%c0_i32, %19] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #dot1>>
+    %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #dot0>>, !tt.ptr<tensor<32x256xf16, #dot1>>) : i32 {
+      // CHECK: tt.load {{.*}}, {{.*}} : tensor<256x32x!tt.ptr<f16>, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>
+      // CHECK: tt.load {{.*}}, {{.*}} : tensor<32x256x!tt.ptr<f16>, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>
+      %28 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xf16, #dot0>>
+      %29 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #dot1>>
+      %30 = tt.dot %28, %29, %arg10, inputPrecision = tf32 : tensor<256x32xf16, #dot0> * tensor<32x256xf16, #dot1> -> tensor<256x256xf32, #dpas>
+      // CHECK-NOT: tt.advance
+      %31 = tt.advance %arg11, [%c0_i32, %c32_i32] : <tensor<256x32xf16, #dot0>>
+      // CHECK-NOT: tt.advance
+      %32 = tt.advance %arg12, [%c32_i32, %c0_i32] : <tensor<32x256xf16, #dot1>>
+      scf.yield %30, %31, %32 : tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #dot0>>, !tt.ptr<tensor<32x256xf16, #dot1>>
+    }
+    %24 = arith.truncf %23#0 : tensor<256x256xf32, #dpas> to tensor<256x256xf16, #dpas>
+    %25 = triton_gpu.convert_layout %24 : tensor<256x256xf16, #dpas> -> tensor<256x256xf16, #blocked>
+    %26 = arith.extsi %arg8 : i32 to i64
+    // CHECK-NOT: tt.make_tensor_ptr
+    %27 = tt.make_tensor_ptr %arg2, [%15, %20], [%26, %c1_i64], [%14, %19] {order = array<i32: 1, 0>} : <tensor<256x256xf16, #blocked>>
+    // CHECK: tt.store {{.*}}, {{.*}}, {{.*}} : tensor<256x256x!tt.ptr<f16>, #[[BLOCKED]]>
+    tt.store %27, %25 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x256xf16, #blocked>>
+    tt.return
+  }
+}
+
+// -----
+
 // COM: Case 3:
 // COM: Check that operations using block pointers without a layout attribute are rewritten to use a legacy pointer.
 module attributes {"triton_intel_gpu.support_sg_2d_block"} {

From 0b9322c0945dde4ce7b0abe01aa38f3b275872ed Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 8 Oct 2024 14:11:54 +0000
Subject: [PATCH 10/19] skip test_trans_reshape as block ptrs are expected to
 be preserved

---
 python/test/unit/language/test_core.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
index c0b84f53c7..ddddaa6dce 100644
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -4262,6 +4262,11 @@ def kernel():
 
 def test_trans_reshape(device):
 
+    if is_xpu():
+        pytest.skip(
+            "test_trans_reshape: XPU block pointer implementation preserves the block pointer load for eventual lowering to 2D block read, does not convert layouts in TTGIR"
+        )
+
     @triton.jit
     def kernel(in_base_ptr, out_base_ptr, IN_SHAPE0: tl.constexpr, IN_SHAPE1: tl.constexpr):
 

From a156aec09c3b51ef093e0877cde8bfa387a02edc Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Wed, 9 Oct 2024 19:55:50 +0000
Subject: [PATCH 11/19] run test trans reshape but ensure that no convert
 layouts exist on xpu

---
 python/test/unit/language/test_core.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
index ddddaa6dce..11e357ee5c 100644
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -4262,11 +4262,6 @@ def kernel():
 
 def test_trans_reshape(device):
 
-    if is_xpu():
-        pytest.skip(
-            "test_trans_reshape: XPU block pointer implementation preserves the block pointer load for eventual lowering to 2D block read, does not convert layouts in TTGIR"
-        )
-
     @triton.jit
     def kernel(in_base_ptr, out_base_ptr, IN_SHAPE0: tl.constexpr, IN_SHAPE1: tl.constexpr):
 
@@ -4291,8 +4286,12 @@ def kernel(in_base_ptr, out_base_ptr, IN_SHAPE0: tl.constexpr, IN_SHAPE1: tl.con
     actual = torch.zeros(expected.shape, dtype=torch.int32, device=device)
 
     k = kernel[(1, )](input, actual, shape[0], shape[1])
-    assert k.asm['ttgir'].count(
-        'triton_gpu.convert_layout') == 1, "Expected exactly one convert_layout op in the TTGIR after optimization"
+    if is_xpu():
+        assert k.asm['ttgir'].count(
+            'triton_gpu.convert_layout') == 0, "Expected no convert_layout op in the TTGIR after optimization"
+    else:
+        assert k.asm['ttgir'].count(
+            'triton_gpu.convert_layout') == 1, "Expected exactly one convert_layout op in the TTGIR after optimization"
 
     np.testing.assert_equal(to_numpy(expected), to_numpy(actual))
 

From 6ef4ef420695c88464b82443755ba5d48c0170eb Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Wed, 9 Oct 2024 20:27:52 +0000
Subject: [PATCH 12/19] Revert "run test trans reshape but ensure that no
 convert layouts exist on xpu"

This reverts commit ccb395371362b8dff16b044f3852d35a3776b449.

Does not work on LTS currently, as we are removing tensor pointers on LTS and producing different TTGIR.
---
 python/test/unit/language/test_core.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
index 11e357ee5c..ddddaa6dce 100644
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -4262,6 +4262,11 @@ def kernel():
 
 def test_trans_reshape(device):
 
+    if is_xpu():
+        pytest.skip(
+            "test_trans_reshape: XPU block pointer implementation preserves the block pointer load for eventual lowering to 2D block read, does not convert layouts in TTGIR"
+        )
+
     @triton.jit
     def kernel(in_base_ptr, out_base_ptr, IN_SHAPE0: tl.constexpr, IN_SHAPE1: tl.constexpr):
 
@@ -4286,12 +4291,8 @@ def kernel(in_base_ptr, out_base_ptr, IN_SHAPE0: tl.constexpr, IN_SHAPE1: tl.con
     actual = torch.zeros(expected.shape, dtype=torch.int32, device=device)
 
     k = kernel[(1, )](input, actual, shape[0], shape[1])
-    if is_xpu():
-        assert k.asm['ttgir'].count(
-            'triton_gpu.convert_layout') == 0, "Expected no convert_layout op in the TTGIR after optimization"
-    else:
-        assert k.asm['ttgir'].count(
-            'triton_gpu.convert_layout') == 1, "Expected exactly one convert_layout op in the TTGIR after optimization"
+    assert k.asm['ttgir'].count(
+        'triton_gpu.convert_layout') == 1, "Expected exactly one convert_layout op in the TTGIR after optimization"
 
     np.testing.assert_equal(to_numpy(expected), to_numpy(actual))
 

From fcf27d66583095686e32ddc23627367743f543d3 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Wed, 9 Oct 2024 20:48:30 +0000
Subject: [PATCH 13/19] only skip the asm check

---
 python/test/unit/language/test_core.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
index ddddaa6dce..98d7033fb6 100644
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -4262,11 +4262,6 @@ def kernel():
 
 def test_trans_reshape(device):
 
-    if is_xpu():
-        pytest.skip(
-            "test_trans_reshape: XPU block pointer implementation preserves the block pointer load for eventual lowering to 2D block read, does not convert layouts in TTGIR"
-        )
-
     @triton.jit
     def kernel(in_base_ptr, out_base_ptr, IN_SHAPE0: tl.constexpr, IN_SHAPE1: tl.constexpr):
 
@@ -4291,8 +4286,9 @@ def kernel(in_base_ptr, out_base_ptr, IN_SHAPE0: tl.constexpr, IN_SHAPE1: tl.con
     actual = torch.zeros(expected.shape, dtype=torch.int32, device=device)
 
     k = kernel[(1, )](input, actual, shape[0], shape[1])
-    assert k.asm['ttgir'].count(
-        'triton_gpu.convert_layout') == 1, "Expected exactly one convert_layout op in the TTGIR after optimization"
+    if not is_xpu():
+        assert k.asm['ttgir'].count(
+            'triton_gpu.convert_layout') == 1, "Expected exactly one convert_layout op in the TTGIR after optimization"
 
     np.testing.assert_equal(to_numpy(expected), to_numpy(actual))
 

From 65781aa10a25c9b519f96b87586408f004c59b58 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Wed, 9 Oct 2024 21:28:02 +0000
Subject: [PATCH 14/19] address review comments

---
 .../rewrite-tensor-pointer.mlir               | 50 ++++++++++++
 .../RewriteTensorPointer.cpp                  | 81 +++++++++----------
 2 files changed, 90 insertions(+), 41 deletions(-)

diff --git a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
index 2e88060686..ae9c269bc7 100644
--- a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
+++ b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
@@ -335,3 +335,53 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32
     tt.return
   }
 }
+
+// -----
+
+// COM: Case 5:
+// COM: Check that a make tensor ptr with no loads is handled properly
+// CHECK: #[[DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
+#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, triton_gpu.target = "xpu", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
+  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    // CHECK:  @matmul_kernel_with_block_pointers
+    %c4_i32 = arith.constant 4 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %c1024_i64 = arith.constant 1024 : i64
+    %c5120_i64 = arith.constant 5120 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c0_i32 = arith.constant 0 : i32
+    %c4096_i64 = arith.constant 4096 : i64
+    %c32_i32 = arith.constant 32 : i32
+    %c64_i32 = arith.constant 64 : i32
+    %c5120_i32 = arith.constant 5120 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #dpas>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.divsi %0, %c64_i32 : i32
+    %2 = arith.muli %1, %c4_i32 : i32
+    %3 = arith.subi %c4_i32, %2 : i32
+    %4 = arith.minsi %3, %c4_i32 : i32
+    %5 = arith.remsi %0, %4 : i32
+    %6 = arith.addi %2, %5 : i32
+    %7 = arith.remsi %0, %c64_i32 : i32
+    %8 = arith.divsi %7, %4 : i32
+    %9 = arith.muli %6, %c256_i32 : i32
+    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+    %10 = tt.make_tensor_ptr %arg0, [%c1024_i64, %c5120_i64], [%c5120_i64, %c1_i64], [%9, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>
+    %11 = arith.muli %8, %c256_i32 : i32
+    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 0, 1>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+    %12 = tt.make_tensor_ptr %arg1, [%c5120_i64, %c4096_i64], [%c1_i64, %c5120_i64], [%c0_i32, %11] {order = array<i32: 0, 1>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
+    %13:3 = scf.for %arg3 = %c0_i32 to %c5120_i32 step %c32_i32 iter_args(%arg4 = %cst, %arg5 = %10, %arg6 = %12) -> (tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>, !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>)  : i32 {
+      // CHECK:  tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+      // CHECK:  tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+      %19 = tt.advance %arg5, [%c0_i32, %c32_i32] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>
+      %20 = tt.advance %arg6, [%c32_i32, %c0_i32] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
+      scf.yield %arg4, %19, %20 : tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>, !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
+    }
+    %14 = tt.make_tensor_ptr %arg2, [%c1024_i64, %c4096_i64], [%c4096_i64, %c1_i64], [%9, %11] {order = array<i32: 1, 0>} : <tensor<256x256xf16, #dpas>>
+    %15 = arith.truncf %13#0 : tensor<256x256xf32, #dpas> to tensor<256x256xf16, #dpas>
+    // CHECK: tt.store {{.*}}, {{.*}}, {{.*}} : !tt.ptr<tensor<256x256xf16, #[[DPAS]]>
+    tt.store %14, %15 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x256xf16, #dpas>>
+    tt.return
+  }
+}
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index 2b3bfc7723..81ed0c89d1 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -675,52 +675,51 @@ class TritonIntelGPURewriteTensorPointerPass
     ModuleOp mod = getOperation();
 
     DenseSet<Operation *> tensorPointersToRemove;
-    mod.walk([&](Operation *op) {
-      if (isa<tt::MakeTensorPtrOp>(op)) {
-        DenseSet<Operation *> workingSet;
+    mod.walk([&](tt::MakeTensorPtrOp makeTensorPtrOp) {
+      DenseSet<Operation *> workingSet;
 
-        auto makeTensorPtrOp = dyn_cast<tt::MakeTensorPtrOp>(op);
-        LDBG("Considering: " << *op);
-        Value result = op->getResult(0);
-        for (auto user : result.getUsers()) {
-          workingSet.insert(user);
-        }
-        while (!workingSet.empty()) {
-          auto crtOpItr = workingSet.begin();
-          auto crtOp = *crtOpItr;
-          LDBG("Processing op: " << *crtOp);
-          if (isa<tt::LoadOp, tt::StoreOp>(crtOp)) {
-            if (shouldRemove(makeTensorPtrOp,
-                             /*isUsedByStoreOp=*/isa<tt::StoreOp>(crtOp),
-                             /*isBlockLoad=*/
-                             isa<tt::LoadOp>(crtOp) &&
-                                 crtOp->hasAttr(ttgi::TritonIntelGPUDialect::
-                                                    getBlockIOAttrName()))) {
-              tensorPointersToRemove.insert(makeTensorPtrOp);
-            }
-          } else if (auto forOp = dyn_cast<scf::ForOp>(crtOp)) {
-            for (auto [arg, blockArg] :
-                 llvm::zip(forOp.getInitArgs(),
-                           forOp.getBody()->getArguments().drop_front(
-                               forOp.getNumInductionVars()))) {
-              if (arg == makeTensorPtrOp) {
-                // add users of block arg
-                for (auto user : blockArg.getUsers()) {
-                  workingSet.insert(user);
-                }
+      LDBG("Considering: " << makeTensorPtrOp);
+      Value result = makeTensorPtrOp.getResult();
+      for (auto user : result.getUsers()) {
+        workingSet.insert(user);
+      }
+      while (!workingSet.empty()) {
+        auto crtOpItr = workingSet.begin();
+        auto crtOp = *crtOpItr;
+        LDBG("Processing op: " << *crtOp);
+        if (isa<tt::LoadOp, tt::StoreOp>(crtOp)) {
+          if (shouldRemove(
+                  makeTensorPtrOp,
+                  /*isUsedByStoreOp=*/isa<tt::StoreOp>(crtOp),
+                  /*isBlockLoad=*/
+                  isa<tt::LoadOp>(crtOp) &&
+                      crtOp->hasAttr(
+                          ttgi::TritonIntelGPUDialect::getBlockIOAttrName()))) {
+            tensorPointersToRemove.insert(makeTensorPtrOp);
+            return;
+          }
+        } else if (auto forOp = dyn_cast<scf::ForOp>(crtOp)) {
+          for (auto [arg, blockArg] :
+               llvm::zip(forOp.getInitArgs(),
+                         forOp.getBody()->getArguments().drop_front(
+                             forOp.getNumInductionVars()))) {
+            if (arg == makeTensorPtrOp) {
+              // add users of block arg
+              for (auto user : blockArg.getUsers()) {
+                workingSet.insert(user);
               }
             }
-          } else if (crtOp->getNumResults() > 0) {
-            // TODO: should we handle more than one result?
-            auto crtOpResult = crtOp->getResult(0);
-            LDBG("Not a load store and not a loop, adding users to working "
-                 "set.");
-            for (auto user : crtOpResult.getUsers()) {
-              workingSet.insert(user);
-            }
           }
-          workingSet.erase(crtOpItr);
+        } else if (crtOp->getNumResults() > 0) {
+          // TODO: should we handle more than one result?
+          auto crtOpResult = crtOp->getResult(0);
+          LDBG("Not a load store and not a loop, adding users to working "
+               "set.");
+          for (auto user : crtOpResult.getUsers()) {
+            workingSet.insert(user);
+          }
         }
+        workingSet.erase(crtOpItr);
       }
     });
 

From 5692c253ac1aa4fac81084bff7e25b77047090d9 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Wed, 9 Oct 2024 21:38:49 +0000
Subject: [PATCH 15/19] add stores too

---
 test/TritonIntelGPU/rewrite-tensor-pointer.mlir | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
index ae9c269bc7..55825208c9 100644
--- a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
+++ b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
@@ -380,8 +380,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32
     }
     %14 = tt.make_tensor_ptr %arg2, [%c1024_i64, %c4096_i64], [%c4096_i64, %c1_i64], [%9, %11] {order = array<i32: 1, 0>} : <tensor<256x256xf16, #dpas>>
     %15 = arith.truncf %13#0 : tensor<256x256xf32, #dpas> to tensor<256x256xf16, #dpas>
-    // CHECK: tt.store {{.*}}, {{.*}}, {{.*}} : !tt.ptr<tensor<256x256xf16, #[[DPAS]]>
-    tt.store %14, %15 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x256xf16, #dpas>>
     tt.return
   }
 }

From 0a42f99348f5df4785399f3a0bc0937e4721cc80 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Thu, 10 Oct 2024 01:10:48 +0000
Subject: [PATCH 16/19] use explicit walker advance

---
 .../intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index 81ed0c89d1..b6fff0affe 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -696,7 +696,7 @@ class TritonIntelGPURewriteTensorPointerPass
                       crtOp->hasAttr(
                           ttgi::TritonIntelGPUDialect::getBlockIOAttrName()))) {
             tensorPointersToRemove.insert(makeTensorPtrOp);
-            return;
+            return WalkResult::advance();
           }
         } else if (auto forOp = dyn_cast<scf::ForOp>(crtOp)) {
           for (auto [arg, blockArg] :

From 7e1887043a0d77d30390a193e1cb6f4644bbd329 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Thu, 10 Oct 2024 01:14:59 +0000
Subject: [PATCH 17/19] Revert "use explicit walker advance"

this caused the lambda in walk to be typed which means we need an explicit return everywhere

This reverts commit 0a42f99348f5df4785399f3a0bc0937e4721cc80.
---
 .../intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index b6fff0affe..81ed0c89d1 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -696,7 +696,7 @@ class TritonIntelGPURewriteTensorPointerPass
                       crtOp->hasAttr(
                           ttgi::TritonIntelGPUDialect::getBlockIOAttrName()))) {
             tensorPointersToRemove.insert(makeTensorPtrOp);
-            return WalkResult::advance();
+            return;
           }
         } else if (auto forOp = dyn_cast<scf::ForOp>(crtOp)) {
           for (auto [arg, blockArg] :

From 5b651b48eeaa3614e121d7eb66943bceb27114d9 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Thu, 10 Oct 2024 01:51:49 +0000
Subject: [PATCH 18/19] use WalkResult::advance

---
 .../lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp      | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index 81ed0c89d1..13ca4bea4f 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -696,7 +696,7 @@ class TritonIntelGPURewriteTensorPointerPass
                       crtOp->hasAttr(
                           ttgi::TritonIntelGPUDialect::getBlockIOAttrName()))) {
             tensorPointersToRemove.insert(makeTensorPtrOp);
-            return;
+            return WalkResult::advance();
           }
         } else if (auto forOp = dyn_cast<scf::ForOp>(crtOp)) {
           for (auto [arg, blockArg] :
@@ -721,6 +721,7 @@ class TritonIntelGPURewriteTensorPointerPass
         }
         workingSet.erase(crtOpItr);
       }
+      return WalkResult::advance();
     });
 
     auto markTensorPointerForRemoval = [this,

From cb97ab3ccad612f20e939c071be1e8f07cfb74e5 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Thu, 10 Oct 2024 02:04:49 +0000
Subject: [PATCH 19/19] default to removing tensor pointer

---
 test/TritonIntelGPU/rewrite-tensor-pointer.mlir        | 10 +++++-----
 .../TritonIntelGPUTransforms/RewriteTensorPointer.cpp  |  5 +++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
index 55825208c9..596f29fea7 100644
--- a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
+++ b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
@@ -339,7 +339,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32
 // -----
 
 // COM: Case 5:
-// COM: Check that a make tensor ptr with no loads is handled properly
+// COM: Check that a make tensor ptr with no loads is properly removed
 // CHECK: #[[DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, triton_gpu.target = "xpu", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
@@ -366,14 +366,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32
     %7 = arith.remsi %0, %c64_i32 : i32
     %8 = arith.divsi %7, %4 : i32
     %9 = arith.muli %6, %c256_i32 : i32
-    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+    // CHECK-NOT: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
     %10 = tt.make_tensor_ptr %arg0, [%c1024_i64, %c5120_i64], [%c5120_i64, %c1_i64], [%9, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>
     %11 = arith.muli %8, %c256_i32 : i32
-    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 0, 1>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+    // CHECK-NOT: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 0, 1>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
     %12 = tt.make_tensor_ptr %arg1, [%c5120_i64, %c4096_i64], [%c1_i64, %c5120_i64], [%c0_i32, %11] {order = array<i32: 0, 1>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
     %13:3 = scf.for %arg3 = %c0_i32 to %c5120_i32 step %c32_i32 iter_args(%arg4 = %cst, %arg5 = %10, %arg6 = %12) -> (tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>, !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>)  : i32 {
-      // CHECK:  tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
-      // CHECK:  tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+      // CHECK-NOT:  tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+      // CHECK-NOT:  tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
       %19 = tt.advance %arg5, [%c0_i32, %c32_i32] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>
       %20 = tt.advance %arg6, [%c32_i32, %c0_i32] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
       scf.yield %arg4, %19, %20 : tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>, !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index 13ca4bea4f..0857ecba04 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -676,6 +676,7 @@ class TritonIntelGPURewriteTensorPointerPass
 
     DenseSet<Operation *> tensorPointersToRemove;
     mod.walk([&](tt::MakeTensorPtrOp makeTensorPtrOp) {
+      tensorPointersToRemove.insert(makeTensorPtrOp);
       DenseSet<Operation *> workingSet;
 
       LDBG("Considering: " << makeTensorPtrOp);
@@ -688,14 +689,14 @@ class TritonIntelGPURewriteTensorPointerPass
         auto crtOp = *crtOpItr;
         LDBG("Processing op: " << *crtOp);
         if (isa<tt::LoadOp, tt::StoreOp>(crtOp)) {
-          if (shouldRemove(
+          if (!shouldRemove(
                   makeTensorPtrOp,
                   /*isUsedByStoreOp=*/isa<tt::StoreOp>(crtOp),
                   /*isBlockLoad=*/
                   isa<tt::LoadOp>(crtOp) &&
                       crtOp->hasAttr(
                           ttgi::TritonIntelGPUDialect::getBlockIOAttrName()))) {
-            tensorPointersToRemove.insert(makeTensorPtrOp);
+            tensorPointersToRemove.erase(makeTensorPtrOp);
             return WalkResult::advance();
           }
         } else if (auto forOp = dyn_cast<scf::ForOp>(crtOp)) {