[FuseReshape] Address review comments: simplify boundaryCheck remapping, drop unit-stride check

whitneywhtsang · whitneywhtsang · commit 02602add3056 · 2025-10-16T00:25:15.000Z
diff --git a/test/Triton/Intel/FuseReshape/fuse-reshape.mlir b/test/Triton/Intel/FuseReshape/fuse-reshape.mlir
@@ -63,7 +63,7 @@ tt.func public @fuseLoadWithReshape2(%arg0: !tt.ptr<tensor<32x256xbf16>>, %arg1:
 // CHECK: [[ADD2:%.*]] = arith.addi [[MUL2]], %c0_i32 : i32
 // CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg1, [%c1024_i64, [[ADD1]]], [%c1_i64, %c512_i64], [%c32_i32, [[ADD2]]] {order = array<i32: 0, 1>} : <tensor<256x32xbf16>>
 // CHECK: scf.for
-// CHECK:   [[LOAD_A:%.*]] = tt.load [[PTR]] : !tt.ptr<tensor<256x32xbf16>>
+// CHECK:   [[LOAD_A:%.*]] = tt.load [[PTR]] {boundaryCheck = array<i32: 0>} : !tt.ptr<tensor<256x32xbf16>>
 // CHECK:   tt.dot [[LOAD_A]], {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
 
 // -----
diff --git a/third_party/intel/include/Dialect/Triton/Transforms/Passes.td b/third_party/intel/include/Dialect/Triton/Transforms/Passes.td
@@ -53,15 +53,15 @@ def TritonIntelFuseReshape
     For example, given:
         %ptr = tt.make_tensor_ptr %base_ptr, [%s0, %s1, %s2], [%a, %b, %c], [%x, %y, %z]
                                   {order = array<i32: 2, 1, 0>} : <tensor<1x512x64xf16>>
-        %load = tt.load %ptr {boundaryCheck = array<i32: 1, 2>} : !tt.ptr<tensor<1x512x64xf16>>
+        %load = tt.load %ptr {boundaryCheck = array<i32: 2>} : !tt.ptr<tensor<1x512x64xf16>>
         %A = tt.reshape %load : tensor<1x512x64xf16> -> tensor<512x64xf16>
         %dot %A, ... : tensor<512x64xf16> x tensor<64x32xf16> -> tensor<512x32xf16>
 
     The transformation drops the reshape operation, and generates:
         %div = %a / %b
         %ptr = tt.make_tensor_ptr %base_ptr, [%s0 * %div + %s1, %s2], [%b, %c], [%x * %div + %y, %z]
                                   {order = array<i32: 1, 0>} : <tensor<512x64xf16>>
-        %A = tt.load %ptr {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<512x64xf16>>
+        %A = tt.load %ptr {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<512x64xf16>>
         %dot %A, ... : tensor<512x64xf16> x tensor<64x32xf16> -> tensor<512x32xf16>
   }];
 
diff --git a/third_party/intel/lib/Dialect/Triton/Transforms/FuseReshape.cpp b/third_party/intel/lib/Dialect/Triton/Transforms/FuseReshape.cpp
@@ -24,23 +24,6 @@ namespace mlir::triton::intel {
 
 namespace {
 
-// Transform:
-//   %one = arith.constant 1 : i64
-//   %ptr = make_tensor_ptr %q_view, [%q, %q_23, %q_24],
-//            [%q_25, %q_26, %one], [%offset_5, %offset_1_13, %q_28]
-//            {order = array<i32: 2, 1, 0>} : <tensor<1x512x64xf16>>
-//   %load = tt.load %ptr {boundaryCheck = array<i32: 1, 2>}
-//         : !tt.ptr<tensor<1x512x64xf16>>
-//   %a = tt.reshape %load : tensor<1x512x64xf16> -> tensor<512x64xf16>
-//   tt.dot(%a, ...)
-// into:
-//   %one = arith.constant 1 : i64
-//   %ptr = make_tensor_ptr %q_view, [%q_23, %q_24], [%q_26, %one],
-//            [%offset_1_13, %offset_5*%q_25+%q_28]
-//            {order = array<i32: 1, 0>} : <tensor<512x64xf16>>
-//   %a = tt.load %ptr {boundaryCheck = array<i32: 0, 1>}
-//      : !tt.ptr<tensor<512x64xf16>>
-//   tt.dot(%a, ...)
 class FuseReshape {
 private:
   SmallPtrSet<Operation *, 8> cleanUp;
@@ -250,25 +233,10 @@ class FuseReshape {
     auto newLoadOp =
         cast<tt::LoadOp>(mapping.lookup(static_cast<Operation *>(loadOp)));
     ArrayRef<int> boundaryCheck = newLoadOp.getBoundaryCheck();
-
-    switch (boundaryCheck.size()) {
-    case 0:
-      break;
-    case 1:
-    // intentional fall-through
-    case 2: {
-      SmallVector<int> newBoundaryCheck;
-      if ((boundaryCheck[0] - 1) != 0)
-        newBoundaryCheck.push_back((boundaryCheck[0] - 1));
-      if (boundaryCheck.size() == 2 && (boundaryCheck[1] - 1) != 0)
-        newBoundaryCheck.push_back(boundaryCheck[1] - 1);
-      newLoadOp.setBoundaryCheck(newBoundaryCheck);
-    } break;
-    default:
-      // Note: while selecting candidates, we already ensured that the original
-      // load's boundary check doesn't check dim zero. So its max rank should
-      // be 2.
-      assert(boundaryCheck.size() != 3 && "Unexpected boundary check rank");
+    for (int idx : boundaryCheck) {
+      assert(idx == (newInnermostDimIdx + 1) &&
+             "Unexpected boundary check idx");
+      newLoadOp.setBoundaryCheck({static_cast<int>(newInnermostDimIdx)});
     }
   }
 
@@ -359,18 +327,6 @@ class FuseReshape {
       ++innermostDimIdx;
     }
 
-    // Ensure that the innermost stride is one.
-    auto strides = makeTensorPtrOp->getStrides();
-    Value innermostStride = strides[innermostDimIdx];
-    if (!innermostStride.getDefiningOp() ||
-        !isa<arith::ConstantIntOp>(innermostStride.getDefiningOp()))
-      return false;
-
-    auto integerCst =
-        cast<arith::ConstantIntOp>(innermostStride.getDefiningOp());
-    if (integerCst.value() != 1)
-      return false;
-
     // Ensure the load operation checks at most the innermost dimension.
     return llvm::all_of(loadOp.getBoundaryCheck(),
                         [&](int idx) { return idx == innermostDimIdx; });