Skip to content

Commit 85f5eb4

Browse files
committed
Drop boundary check and make isCandidate more restrictive
Signed-off-by: Ettore Tiotto <[email protected]>
1 parent c2908db commit 85f5eb4

File tree

2 files changed

+22
-122
lines changed

2 files changed

+22
-122
lines changed

test/Triton/Intel/FuseReshape/fuse-reshape.mlir

Lines changed: 6 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ tt.func public @fuseLoadWithReshape1(%arg0: !tt.ptr<tensor<256x32xbf16>>, %arg1:
1212
%cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32>
1313
%0 = tt.make_tensor_ptr %arg1, [%c1_i64, %c64_i64, %c1024_i64], [%c1024_i64, %c4_i64, %c1_i64], [%c2_i32, %c1_i32, %c0_i32] {order = array<i32: 2, 1, 0>} : <tensor<1x32x256xbf16>>
1414
%1 = tt.load %arg0 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xbf16>>
15-
%3 = tt.load %0 {boundaryCheck = array<i32: 1, 2>} : !tt.ptr<tensor<1x32x256xbf16>>
15+
%3 = tt.load %0 {boundaryCheck = array<i32: 2>} : !tt.ptr<tensor<1x32x256xbf16>>
1616
%4 = tt.reshape %3 : tensor<1x32x256xbf16> -> tensor<32x256xbf16>
1717
%5 = tt.dot %1, %4, %cst, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
1818
tt.return
@@ -26,17 +26,8 @@ tt.func public @fuseLoadWithReshape1(%arg0: !tt.ptr<tensor<256x32xbf16>>, %arg1:
2626
// CHECK: [[MUL2:%.*]] = arith.muli %c2_i32, [[TRUNC]] : i32
2727
// CHECK: [[ADD2:%.*]] = arith.addi [[MUL2]], %c1_i32 : i32
2828
// CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg1, [[[ADD1]], %c1024_i64], [%c4_i64, %c1_i64], [[[ADD2]], %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
29-
// CHECK: [[ADD3:%.*]] = arith.addi %c1_i32, %c32_i32 : i32
30-
// CHECK: [[TRUNC:%.*]] = arith.trunci %c4_i64_0 : i64 to i32
31-
// CHECK: [[COND:%.*]] = arith.cmpi ult, [[ADD3]], [[TRUNC]] : i32
32-
// CHECK: [[IF_RES:%.*]] = scf.if [[COND]] -> (tensor<32x256xbf16>) {
33-
// CHECK: [[LOAD_B:%.*]] = tt.load [[PTR]] {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<32x256xbf16>>
34-
// CHECK: scf.yield [[LOAD_B]] : tensor<32x256xbf16>
35-
// CHECK: } else {
36-
// CHECK: [[ZERO:%.*]] = arith.constant dense<0.000000e+00> : tensor<32x256xbf16>
37-
// CHECK: scf.yield [[ZERO]] : tensor<32x256xbf16>
38-
// CHECK: }
39-
// CHECK: tt.dot {{.*}}, [[IF_RES]], {{.*}}, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
29+
// CHECK: [[LOAD_B:%.*]] = tt.load [[PTR]] {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<32x256xbf16>>
30+
// CHECK: tt.dot {{.*}}, [[LOAD_B]], {{.*}}, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
4031

4132
// -----
4233

@@ -113,7 +104,7 @@ tt.func public @fuseLoadWithReshape3(%a_ptr: !tt.ptr<f32> {tt.divisibility = 16
113104
%21 = arith.extsi %stride_bk : i32 to i64
114105
%22 = tt.make_tensor_ptr %b_ptr, [%16, %20], [%21, %c1_i64], [%c0_i32, %19] {order = array<i32: 1, 0>} : <tensor<32x128xf32>>
115106
%accumulator:3 = scf.for %k = %c0_i32 to %K step %c32_i32 iter_args(%a_block_ptr = %18, %b_block_ptr = %22, %accumulator_0 = %cst) -> (!tt.ptr<tensor<1x256x32xf32>>, !tt.ptr<tensor<32x128xf32>>, tensor<256x128xf32>) : i32 {
116-
%25 = tt.load %a_block_ptr {boundaryCheck = array<i32: 1, 2>} : !tt.ptr<tensor<1x256x32xf32>>
107+
%25 = tt.load %a_block_ptr {boundaryCheck = array<i32: 2>} : !tt.ptr<tensor<1x256x32xf32>>
117108
%26 = tt.reshape %25 : tensor<1x256x32xf32> -> tensor<256x32xf32>
118109
%27 = tt.load %b_block_ptr {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x128xf32>>
119110
%28 = tt.dot %26, %27, %cst, inputPrecision = tf32 : tensor<256x32xf32> * tensor<32x128xf32> -> tensor<256x128xf32>
@@ -137,20 +128,10 @@ tt.func public @fuseLoadWithReshape3(%a_ptr: !tt.ptr<f32> {tt.divisibility = 16
137128
// CHECK: [[MUL2:%.*]] = arith.muli %c0_i32, [[TRUNC]] : i32
138129
// CHECK: [[ADD2:%.*]] = arith.addi [[MUL2]], %c128_i32 : i32
139130
// CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg0, [[[ADD1]], %16], [%17, %c1_i64], [[[ADD2]], %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xf32>>
140-
// CHECK: [[CST_256:%.*]] = arith.constant 256 : i32
141-
// CHECK: [[ADD3:%.*]] = arith.addi %c128_i32, [[CST_256]] : i32
142-
// CHECK: [[TRUNC:%.*]] = arith.trunci [[EXT_M]] : i64 to i32
143-
// CHECK: [[COND:%.*]] = arith.cmpi ult, [[ADD3]], [[TRUNC]] : i32
144131
// CHECK: scf.for {{.*}} = %c0_i32 to {{.*}} step %c32_i32 iter_args([[ARG:%.*]] = [[PTR]]
145-
// CHECK: [[IF_RES:%.*]] = scf.if [[COND]] -> (tensor<256x32xf32>) {
146132
// CHECK: [[LOAD_A:%.*]] = tt.load [[ARG]] {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<256x32xf32>>
147-
// CHECK: scf.yield [[LOAD_A]] : tensor<256x32xf32>
148-
// CHECK: } else {
149-
// CHECK: [[ZERO:%.*]] = arith.constant dense<0.000000e+00> : tensor<256x32xf32>
150-
// CHECK: scf.yield [[ZERO]] : tensor<256x32xf32>
151-
// CHECK: }
152-
// CHECK: tt.dot [[IF_RES]], {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xf32> * tensor<32x128xf32> -> tensor<256x128xf32>
153-
// CHECK: tt.advance [[ARG]], [%c0_i32, %c32_i32] : <tensor<256x32xf32>>
133+
// CHECK: tt.dot [[LOAD_A]], {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xf32> * tensor<32x128xf32> -> tensor<256x128xf32>
134+
// CHECK: tt.advance [[ARG]], [%c0_i32, %c32_i32] : <tensor<256x32xf32>>
154135

155136
// -----
156137

@@ -186,7 +167,6 @@ tt.func public @fuseLoadWithReshape4(%arg0: i32, %arg1: !tt.ptr<f16>, %arg2: !tt
186167
scf.yield %add : i32
187168
}
188169
tt.return
189-
190170
}
191171
// CHECK-LABEL: fuseLoadWithReshape4
192172
// CHECK-NOT: tt.reshape

third_party/intel/lib/Dialect/Triton/Transforms/FuseReshape.cpp

Lines changed: 16 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -24,59 +24,6 @@ namespace mlir::triton::intel {
2424

2525
namespace {
2626

27-
scf::IfOp createIfBlock(OpBuilder &builder, Location loc, arith::CmpIOp condOp,
28-
tt::LoadOp loadOp) {
29-
assert(isa<RankedTensorType>(loadOp.getType()) &&
30-
"Unexpected load result type");
31-
32-
auto tensorType = cast<RankedTensorType>(loadOp.getType());
33-
assert(tensorType.getShape().size() == 2);
34-
Type elemType = tensorType.getElementType();
35-
36-
builder.setInsertionPointAfter(loadOp);
37-
auto ifOp = builder.create<scf::IfOp>(loc, tensorType, condOp, true, true);
38-
loadOp->moveBefore(ifOp.thenBlock(), ifOp.thenBlock()->end());
39-
builder.setInsertionPointAfter(loadOp);
40-
builder.create<scf::YieldOp>(loc, loadOp->getResult(0));
41-
42-
builder.setInsertionPointToStart(ifOp.elseBlock());
43-
tt::PaddingOption padding = (!loadOp.getPadding())
44-
? tt::PaddingOption::PAD_ZERO
45-
: *loadOp.getPadding();
46-
DenseElementsAttr denseAttr = nullptr;
47-
switch (padding) {
48-
case tt::PaddingOption::PAD_ZERO: {
49-
denseAttr = DenseElementsAttr::get(cast<ShapedType>(tensorType),
50-
builder.getZeroAttr(elemType));
51-
} break;
52-
case tt::PaddingOption::PAD_NAN: {
53-
assert(elemType.isF128() && "Expecting a floating point type");
54-
auto NaNVal =
55-
APFloat::getNaN(cast<FloatType>(elemType).getFloatSemantics());
56-
denseAttr = DenseElementsAttr::get(cast<ShapedType>(tensorType),
57-
builder.getFloatAttr(elemType, NaNVal));
58-
} break;
59-
default:
60-
llvm_unreachable("Unhandled padding kind");
61-
}
62-
assert(denseAttr && "Expecting a valid attribute");
63-
64-
Value other = builder.create<arith::ConstantOp>(loc, tensorType, denseAttr);
65-
builder.create<scf::YieldOp>(loc, other);
66-
return ifOp;
67-
}
68-
69-
scf::IfOp createCheckedLoad(OpBuilder &builder, arith::CmpIOp cmpOp,
70-
tt::LoadOp loadOp) {
71-
scf::IfOp ifOp = createIfBlock(builder, loadOp.getLoc(), cmpOp, loadOp);
72-
loadOp->replaceUsesWithIf(ifOp, [&](OpOperand &operand) {
73-
if (auto yieldOp = dyn_cast<scf::YieldOp>(operand.getOwner()))
74-
return yieldOp->getParentOp() != ifOp;
75-
return true;
76-
});
77-
return ifOp;
78-
};
79-
8027
// Transform:
8128
// %one = arith.constant 1 : i64
8229
// %ptr = make_tensor_ptr %q_view, [%q, %q_23, %q_24],
@@ -298,24 +245,12 @@ class FuseReshape {
298245
propagateToUsers(ptr, chain, mapping);
299246
cleanUp.insert(makeTensorPtrOp);
300247

301-
// We have collapsed 2 dimensions into one, therefore we might have to
302-
// materialize the boundary check for the new collapsed dimension. There
303-
// are 2 possibilities:
304-
// a) if the load checks only the innermost dimension, we are ok because
305-
// we haven't collapsed that dimension
306-
// b) if the load checks the new outermost dimension, the boundary check
307-
// on the load is not sufficient and we have to materialize the
308-
// correct boundary check. Example:
309-
// OLD PTR NEW PTR
310-
// shape: [20, 10, 5] -> [210, 5]
311-
// strides: [50, 5, 1] -> [ 5, 1]
312-
//
313-
// Consider a load offset of [1, 11, 1], this access is clearly
314-
// out-of-bound in dim 1 (11 > 10). However, the new offset is no
315-
// longer out-of-bound (5 < 210).
248+
// We have collapsed 2 dimensions into one, therefore we need to adjust the
249+
// boundary check of the new load.
316250
auto newLoadOp =
317251
cast<tt::LoadOp>(mapping.lookup(static_cast<Operation *>(loadOp)));
318252
ArrayRef<int> boundaryCheck = newLoadOp.getBoundaryCheck();
253+
319254
switch (boundaryCheck.size()) {
320255
case 0:
321256
break;
@@ -327,26 +262,7 @@ class FuseReshape {
327262
newBoundaryCheck.push_back((boundaryCheck[0] - 1));
328263
if (boundaryCheck.size() == 2 && (boundaryCheck[1] - 1) != 0)
329264
newBoundaryCheck.push_back(boundaryCheck[1] - 1);
330-
331265
newLoadOp.setBoundaryCheck(newBoundaryCheck);
332-
333-
if (llvm::any_of(newBoundaryCheck, [&](unsigned boundIdx) {
334-
return boundIdx == newOutermostDimIdx + 1;
335-
})) {
336-
unsigned oldIdx = newOutermostDimIdx + 1;
337-
auto tensorType = cast<RankedTensorType>(loadOp.getResult().getType());
338-
Type elemType = tensorType.getElementType();
339-
ArrayRef<int64_t> resShape = tensorType.getShape();
340-
auto add = builder.create<arith::AddIOp>(
341-
loc, offsets[oldIdx],
342-
builder.create<arith::ConstantIntOp>(loc, offsets[oldIdx].getType(),
343-
resShape[oldIdx]));
344-
auto cmpOp = builder.create<arith::CmpIOp>(
345-
loc, arith::CmpIPredicate::ult, add,
346-
builder.create<arith::TruncIOp>(loc, add.getResult().getType(),
347-
shapes[oldIdx]));
348-
createCheckedLoad(builder, cmpOp, newLoadOp);
349-
}
350266
} break;
351267
default:
352268
// Note: while selecting candidates, we already ensured that the original
@@ -361,11 +277,13 @@ class FuseReshape {
361277
// Where:
362278
// - the reshape operation drops the outermost dimension of the operand,
363279
// which is a 3-dim tensor with outermost dimension extent equal to one
364-
// - the reshape result is used by the dot operation
280+
// - the reshape result is used by a dot operation
365281
// - the reshape operation uses the result of a 3-dim load operation on a
366282
// block pointer (transitively) defined by a `make_tensor_ptr` operation
367283
// - the block pointer points to a tensor that has extent equal to 1 on the
368284
// outermost dimension
285+
// - the load operation doesn't have boundary checks on either of the
286+
// dimensions collapsed
369287
bool isCandidate(tt::ReshapeOp reshapeOp) const {
370288
assert(reshapeOp && "Expecting a valid reshape operation");
371289

@@ -384,8 +302,7 @@ class FuseReshape {
384302
return false;
385303
}
386304

387-
// Check whether \p reshapeOp is used by a `dotOp` (directly or
388-
// indirectly).
305+
// Check whether \p reshapeOp is used by a `dotOp`.
389306
auto usedByDotOp = [](tt::ReshapeOp reshapeOp) {
390307
if (!reshapeOp->hasOneUse())
391308
return false;
@@ -405,6 +322,7 @@ class FuseReshape {
405322
if (!usedByDotOp(reshapeOp))
406323
return false;
407324

325+
// The reshape operation uses the result of a load operation.
408326
Operation *defOp = reshapeOp.getSrc().getDefiningOp();
409327
if (!defOp || !isa<tt::LoadOp>(defOp))
410328
return false;
@@ -413,6 +331,8 @@ class FuseReshape {
413331
if (!loadOp->hasOneUse())
414332
return false;
415333

334+
// The load uses a 3-dim block pointer defined by a make_tensor_ptr
335+
// operation.
416336
Type ptrType = loadOp.getPtr().getType();
417337
if (!tt::isTensorPointerType(ptrType))
418338
return false;
@@ -432,14 +352,14 @@ class FuseReshape {
432352
if (order.front() != tensorTy.getRank() - 1)
433353
return false;
434354

435-
// Ensure that the innermost stride is one.
436355
unsigned innermostDimIdx = 0;
437-
for (int i : order) {
438-
if (i == 0)
356+
for (int idx : order) {
357+
if (idx == 0)
439358
break;
440359
++innermostDimIdx;
441360
}
442361

362+
// Ensure that the innermost stride is one.
443363
auto strides = makeTensorPtrOp->getStrides();
444364
Value innermostStride = strides[innermostDimIdx];
445365
if (!innermostStride.getDefiningOp() ||
@@ -451,9 +371,9 @@ class FuseReshape {
451371
if (integerCst.value() != 1)
452372
return false;
453373

454-
// Ensure the load boundary check doesn't check the outermost dimension.
455-
return llvm::none_of(loadOp.getBoundaryCheck(),
456-
[](int val) { return val == 0; });
374+
// Ensure the load operation checks at most the innermost dimension.
375+
return llvm::all_of(loadOp.getBoundaryCheck(),
376+
[&](int idx) { return idx == innermostDimIdx; });
457377
}
458378

459379
// Prune chains that cannot be handled during fusion. For example,

0 commit comments

Comments
 (0)