[tritonintelgpu-remove-layout-conversions]: Fix failure to find the make_tensor_ptr operation for a tt.store within a while loop. (#4330)

etiotto · web-flow · commit ce49a59f395f · 2025-05-28T17:08:38.000Z
This PR introduces a centralized helper, `findDefiningMakeTensorPtrOp`, that traces a tensor pointer value back to its defining `MakeTensorPtrOp`, and updates two GPU transformation passes (Coalesce and RemoveLayoutConversions) to use it. It fixes issue #4336. --------- Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
diff --git a/test/TritonIntelGPU/combine.mlir b/test/TritonIntelGPU/combine.mlir
@@ -2564,3 +2564,40 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+// Verify that the layout conversion feeding a tt.store is removed when the stored-to block pointer flows through an scf.while loop.
+// CHECK-DAG: #[[$BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+// CHECK-NOT: #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32, ttig.support_sg_2d_block} {
+  // CHECK-LABEL: while_using_advanced_ptr
+  tt.func public @while_using_advanced_ptr(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %c1_i64 = arith.constant 1 : i64
+    %cst = arith.constant dense<5.000000e+00> : tensor<8x128xf32, #blocked>
+    %c0_i32 = arith.constant 0 : i32
+    %0 = tt.get_program_id x : i32
+    %2 = arith.extsi %arg2 : i32 to i64
+    %3 = arith.extsi %arg1 : i32 to i64
+    // CHECK: [[PTR:%.*]] = tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<8x128xf32, #[[$BLOCKED]]>>
+    %4 = tt.make_tensor_ptr %arg0, [%3, %2], [%2, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<8x128xf32, #blocked1>>
+    %8 = arith.cmpi eq, %0, %c0_i32 : i32
+    %6 = scf.while (%arg3 = %4) : (!tt.ptr<tensor<8x128xf32, #blocked1>>) -> (!tt.ptr<tensor<8x128xf32, #blocked1>>) {
+      %7 = arith.cmpi slt, %0, %c0_i32 : i32
+      scf.condition(%7) %arg3 : !tt.ptr<tensor<8x128xf32, #blocked1>>
+    } do {
+    ^bb0(%arg3: !tt.ptr<tensor<8x128xf32, #blocked1>>):
+      // CHECK-NOT: ttg.convert_layout
+      // CHECK: [[SEL:%.*]] = arith.select {{.*}} : !tt.ptr<tensor<8x128xf32, #[[$BLOCKED]]>>
+      // CHECK: [[PTR1:%.*]] = tt.advance [[SEL]], {{.*}} : <tensor<8x128xf32, #[[$BLOCKED]]>>
+      // CHECK: tt.store [[PTR1]], {{.*}} : !tt.ptr<tensor<8x128xf32, #[[$BLOCKED]]>>
+      %12 = arith.select %8, %4, %arg3 : !tt.ptr<tensor<8x128xf32, #blocked1>>
+      %14 = tt.advance %12, [%0, %0] : <tensor<8x128xf32, #blocked1>>
+      %18 = ttg.convert_layout %cst : tensor<8x128xf32, #blocked> -> tensor<8x128xf32, #blocked1>
+      tt.store %14, %18 : !tt.ptr<tensor<8x128xf32, #blocked1>>
+      scf.yield %12 : !tt.ptr<tensor<8x128xf32, #blocked1>>
+    }
+    tt.return
+  }
+}
diff --git a/third_party/intel/include/Utils/Utility.h b/third_party/intel/include/Utils/Utility.h
@@ -4,6 +4,9 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Value.h"
 
+namespace mlir::triton {
+class MakeTensorPtrOp;
+}
 namespace mlir::triton::intel {
 
 // Lookup for a integer constant with the given value and bitwidth in the
@@ -12,6 +15,10 @@ namespace mlir::triton::intel {
 Value findOrCreateIntConstant(Location loc, int val, unsigned bitWidth,
                               OpBuilder &builder);
 
+// Trace `val` through advances, loops and selects to its MakeTensorPtrOp.
+std::optional<mlir::triton::MakeTensorPtrOp>
+findDefiningMakeTensorPtrOp(Value val);
+
 // This function folds the `op` operation and returns the constant value if it
 // has successfully folded to a constant. Otherwise, it returns `std::nullopt`.
 std::optional<int64_t> getFoldedConstantValue(Operation *op);
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp
@@ -2,12 +2,13 @@
 #include "intel/include/Dialect/TritonIntelGPU/IR/Utils.h"
 #include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h"
 #include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "intel/include/Utils/Utility.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/Support/LLVM.h"
+#include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "triton/Tools/StrUtil.h"
@@ -126,62 +127,6 @@ struct CoalescePass
                                  tensorType.getElementType(), encoding);
   }
 
-  // Find the defining makeTensorPtrOp operation of the given value.
-  static std::optional<tt::MakeTensorPtrOp>
-  findDefiningMakeTensorPtrOp(Value val) {
-    LLVM_DEBUG({
-      llvm::dbgs() << "[" DEBUG_TYPE "]: \t"
-                   << "Attempting to find `makeTensorPtrOp` defining: " << val
-                   << "\n";
-    });
-
-    if (auto arg = dyn_cast<BlockArgument>(val)) {
-      Operation *parentOp = arg.getParentBlock()->getParentOp();
-
-      Value loopArg;
-      if (auto forOp = dyn_cast<scf::ForOp>(parentOp))
-        loopArg = forOp.getInitArgs()[arg.getArgNumber() - 1];
-      else if (auto whileOp = dyn_cast<scf::WhileOp>(parentOp))
-        loopArg = whileOp.getInits()[arg.getArgNumber()];
-      else
-        llvm_unreachable("Unexpected parent operator");
-
-      return findDefiningMakeTensorPtrOp(loopArg);
-    }
-
-    if (auto advanceOp = val.getDefiningOp<tt::AdvanceOp>())
-      return findDefiningMakeTensorPtrOp(advanceOp.getPtr());
-    if (auto makePtrOp = val.getDefiningOp<tt::MakeTensorPtrOp>())
-      return makePtrOp;
-    if (auto opRes = dyn_cast<OpResult>(val)) {
-      Operation *defOp = opRes.getOwner();
-      if (auto forOp = dyn_cast<scf::ForOp>(defOp)) {
-        Value val = forOp.getYieldedValues()[opRes.getResultNumber()];
-        return findDefiningMakeTensorPtrOp(val);
-      }
-      if (auto whileOp = dyn_cast<scf::WhileOp>(defOp)) {
-        Value val = whileOp.getYieldedValues()[opRes.getResultNumber()];
-        return findDefiningMakeTensorPtrOp(val);
-      }
-      if (auto selectOp = dyn_cast<arith::SelectOp>(defOp)) {
-        // Give up if the 2 possible definitions aren't the same.
-        Value trueVal = selectOp.getTrueValue(),
-              falseVal = selectOp.getFalseValue();
-        std::optional<tt::MakeTensorPtrOp> trueDef =
-            findDefiningMakeTensorPtrOp(trueVal);
-        std::optional<tt::MakeTensorPtrOp> falseDef =
-            findDefiningMakeTensorPtrOp(falseVal);
-        if (!trueDef || !falseDef || *trueDef != *falseDef)
-          return std::nullopt;
-        return trueDef;
-      }
-
-      assert(false && "unhandled operation");
-    }
-
-    return std::nullopt;
-  }
-
   static bool filterUser(Operation *op) {
     // Yield operations trigger updating the layout of the containing loop
     // results, don't skip them.
@@ -446,10 +391,11 @@ struct CoalescePass
         newArgs.push_back(builder.create<ttg::ConvertLayoutOp>(
             op->getLoc(), newType, operand));
       } else {
-        assert(isa<tt::PointerType>(operand.getType()) &&
+        assert(tt::isTensorPointerType(operand.getType()) &&
                "Expecting operand to have blocked pointer type");
-        auto defOp = findDefiningMakeTensorPtrOp(operand);
-        assert(defOp && "Expected a make_tensor_ptr operation");
+        std::optional<tt::MakeTensorPtrOp> defOp =
+            triton::intel::findDefiningMakeTensorPtrOp(operand);
+        assert(defOp && "Expecting a MakeTensorPtr operation");
         LLVM_DEBUG({
           llvm::dbgs() << "[" DEBUG_TYPE "]: Found definition: " << defOp
                        << "\n";
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
@@ -13,8 +13,10 @@
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
 #include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h"
 #include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
+#include "intel/include/Utils/Utility.h"
 
 #include "triton/Analysis/Utility.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include <deque>
@@ -731,12 +733,12 @@ bool LayoutPropagation::rewriteStoreOp(StoreOp storeOp) {
     return false;
 
   // Locate the operation that created the block pointer.
-  Operation *defOp = ptr.getDefiningOp();
-  while (auto advanceOp = dyn_cast<AdvanceOp>(defOp))
-    defOp = advanceOp.getPtr().getDefiningOp();
-  assert(isa<MakeTensorPtrOp>(defOp) &&
-         "MakeTensorPtrOp should be the only op that creates a tensor pointer");
-  auto makeTensorPtrOp = cast<MakeTensorPtrOp>(defOp);
+  std::optional<triton::MakeTensorPtrOp> defOp =
+      triton::intel::findDefiningMakeTensorPtrOp(ptr);
+  if (!defOp)
+    return false;
+
+  triton::MakeTensorPtrOp makeTensorPtrOp = *defOp;
 
   // DPAS encoding have to be propagated if conversion from a DPAS layout to
   // another layout has been done before.
@@ -1585,8 +1587,9 @@ void hoistConvert(ModuleOp module) {
 }
 
 class TritonIntelGPURemoveLayoutConversionsPass
-    : public intel::impl::TritonIntelGPURemoveLayoutConversionsBase<
-          TritonIntelGPURemoveLayoutConversionsPass> {
+    : public triton::gpu::intel::impl::
+          TritonIntelGPURemoveLayoutConversionsBase<
+              TritonIntelGPURemoveLayoutConversionsPass> {
 public:
   // Cleanup convert ops.
   void cleanupConvertOps() {
diff --git a/third_party/intel/lib/Utils/Utility.cpp b/third_party/intel/lib/Utils/Utility.cpp
@@ -33,6 +33,54 @@ Value findOrCreateIntConstant(Location loc, int val, unsigned bitWidth,
              : builder.createOrFold<arith::ConstantIntOp>(loc, val, bitWidth);
 }
 
+std::optional<tt::MakeTensorPtrOp> findDefiningMakeTensorPtrOp(Value val) {
+  if (auto arg = dyn_cast<BlockArgument>(val)) {
+    Operation *parentOp = arg.getParentBlock()->getParentOp();
+    // Follow the block argument to the loop's corresponding initial operand.
+    Value loopArg;
+    if (auto forOp = dyn_cast<scf::ForOp>(parentOp))
+      loopArg = forOp.getInitArgs()[arg.getArgNumber() - 1]; // arg 0 is the IV
+    else if (auto whileOp = dyn_cast<scf::WhileOp>(parentOp))
+      loopArg = whileOp.getInits()[arg.getArgNumber()];
+    else
+      llvm_unreachable("Unexpected parent operator");
+    // NOTE(review): scf.while `do` args come from scf.condition; confirm.
+    return findDefiningMakeTensorPtrOp(loopArg);
+  }
+  // Look through tt.advance; a tt.make_tensor_ptr ends the search.
+  if (auto advanceOp = val.getDefiningOp<tt::AdvanceOp>())
+    return findDefiningMakeTensorPtrOp(advanceOp.getPtr());
+  if (auto makePtrOp = val.getDefiningOp<tt::MakeTensorPtrOp>())
+    return makePtrOp;
+  if (auto opRes = dyn_cast<OpResult>(val)) {
+    Operation *defOp = opRes.getOwner();
+    if (auto forOp = dyn_cast<scf::ForOp>(defOp)) {
+      Value val = forOp.getYieldedValues()[opRes.getResultNumber()];
+      return findDefiningMakeTensorPtrOp(val);
+    }
+    if (auto whileOp = dyn_cast<scf::WhileOp>(defOp)) {
+      Value val = whileOp.getYieldedValues()[opRes.getResultNumber()];
+      return findDefiningMakeTensorPtrOp(val);
+    }
+    if (auto selectOp = dyn_cast<arith::SelectOp>(defOp)) {
+      // Give up unless both operands trace to the same MakeTensorPtrOp.
+      Value trueVal = selectOp.getTrueValue(),
+            falseVal = selectOp.getFalseValue();
+      std::optional<tt::MakeTensorPtrOp> trueDef =
+          findDefiningMakeTensorPtrOp(trueVal);
+      std::optional<tt::MakeTensorPtrOp> falseDef =
+          findDefiningMakeTensorPtrOp(falseVal);
+      if (!trueDef || !falseDef || *trueDef != *falseDef)
+        return std::nullopt;
+      return trueDef;
+    }
+    // Any other defining operation is not supported.
+    assert(false && "unhandled operation");
+  }
+  // NOTE(review): likely reachable only when the assert above is compiled out.
+  return std::nullopt;
+}
+
 std::optional<int64_t> getFoldedConstantValue(Operation *op) {
   SmallVector<OpFoldResult> results;
   if (failed(op->fold(results)))