@@ -303,3 +303,38 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32} {
// CHECK-LABEL: doNotFuseLoadWithTrans4
// CHECK: tt.trans
}

// -----

#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [16, 0], [0, 16], [0, 32]], lane = [[1, 0], [2, 0], [4, 0], [8, 0]], warp = [[0, 0], [0, 0]], block = []}>
#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
  // COM: Ensure tt.trans is not fused with tt.load when the load uses a pointer yielded by a function call.
  // COM: @func returns one of its two pointer arguments, so the pointer fed to the
  // COM: load below cannot be traced back to a single tt.make_tensor_ptr statically.
  tt.func @func(%cond: i1, %p1: !tt.ptr<tensor<32x64xf16, #linear>>, %p2: !tt.ptr<tensor<32x64xf16, #linear>>) -> !tt.ptr<tensor<32x64xf16, #linear>> attributes {noinline = true} {
    %0 = arith.select %cond, %p1, %p2 : i1, !tt.ptr<tensor<32x64xf16, #linear>>
    tt.return %0 : !tt.ptr<tensor<32x64xf16, #linear>>
  }
  tt.func public @doNotFuseLoadWithTrans5(%arg0: i32, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %cond: i1) {
    %c32_i32 = arith.constant 32 : i32
    %c0_i32 = arith.constant 0 : i32
    %c64_i64 = arith.constant 64 : i64
    %c1_i64 = arith.constant 1 : i64
    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma>
    %7 = tt.make_tensor_ptr %arg1, [%c1_i64, %c64_i64], [%c64_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>
    %9 = tt.make_tensor_ptr %arg2, [%c1_i64, %c64_i64], [%c64_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x64xf16, #linear>>
    %24 = tt.advance %7, [%arg0, %c0_i32] : <tensor<64x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>
    %25 = tt.load %24 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"} : !tt.ptr<tensor<64x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>
    %29:1 = scf.for %arg9 = %c0_i32 to %arg0 step %c32_i32 iter_args(%arg13 = %arg0) -> (i32) : i32 {
      %adv1 = tt.advance %9, [%arg13, %c0_i32] : <tensor<32x64xf16, #linear>>
      %adv2 = tt.advance %9, [%c0_i32, %arg13] : <tensor<32x64xf16, #linear>>
      // COM: The loaded pointer is the opaque result of a call, not a visible chain
      // COM: of make_tensor_ptr/advance ops, so the fusion pass must bail out.
      %adv3 = tt.call @func(%cond, %adv1, %adv2) : (i1, !tt.ptr<tensor<32x64xf16, #linear>>, !tt.ptr<tensor<32x64xf16, #linear>>) -> !tt.ptr<tensor<32x64xf16, #linear>>
      %load1 = tt.load %adv3 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"} : !tt.ptr<tensor<32x64xf16, #linear>>
      %trans1 = tt.trans %load1 {order = array<i32: 1, 0>} : tensor<32x64xf16, #linear> -> tensor<64x32xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
      %dot1 = tt.dot %25, %trans1, %cst_3, inputPrecision = tf32 : tensor<64x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x32xf32, #mma>
      %76 = arith.addi %arg13, %c32_i32 : i32
      scf.yield %76 : i32
    }
    tt.return
  }
  // CHECK-LABEL: doNotFuseLoadWithTrans5
  // CHECK: tt.trans
}