Fix "off-by-one" error in RemoveMasks pass (#4194)

etiotto · web-flow · commit e07bc131977d · 2025-05-14T16:44:30.000+02:00
Fixes #4186, #4187. --------- Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
diff --git a/test/Triton/Intel/RemoveMasks/loop-canonical-masks.mlir b/test/Triton/Intel/RemoveMasks/loop-canonical-masks.mlir
@@ -110,6 +110,7 @@ module {
   // CHECK: }
 
   tt.func public @test_kernel2(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
+    %c7_i32 = arith.constant 7 : i32
     %c8_i32 = arith.constant 8 : i32
     %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32>
     %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x256xf16>
@@ -164,7 +165,7 @@ module {
     %33 = arith.addi %31, %32 : tensor<64x256xi32>
     %34 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<64x256x!tt.ptr<f16>>
     %35 = tt.addptr %34, %33 : tensor<64x256x!tt.ptr<f16>>, tensor<64x256xi32>
-    %36:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %35) -> (tensor<128x256xf32>, tensor<128x64x!tt.ptr<f16>>, tensor<64x256x!tt.ptr<f16>>)  : i32 {
+    %36:3 = scf.for %arg3 = %c0_i32 to %c7_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %35) -> (tensor<128x256xf32>, tensor<128x64x!tt.ptr<f16>>, tensor<64x256x!tt.ptr<f16>>)  : i32 {
       %51 = arith.muli %arg3, %c64_i32 : i32
       %52 = arith.subi %c512_i32, %51 : i32
       %53 = tt.splat %52 : i32 -> tensor<1x64xi32>
diff --git a/third_party/intel/lib/Dialect/Triton/Transforms/RemoveMasks.cpp b/third_party/intel/lib/Dialect/Triton/Transforms/RemoveMasks.cpp
@@ -34,10 +34,12 @@ class MaskValidatorBase {
 
   // Create the loop versioning condition based on the mask.
   virtual Value getVersioningCond(scf::ForOp &forOp, Value mask) const = 0;
+
+  virtual std::string getName() const = 0;
 };
 
 // A mask validator which ensures that the mask can be reduced to the form:
-//  `END < N-i*END`.
+//  `END-1 < N-i*END`.
 class CanonicalMaskValidator final : public MaskValidatorBase {
 public:
   // This structure is used to store the information about a mask in canonical
@@ -47,7 +49,7 @@ class CanonicalMaskValidator final : public MaskValidatorBase {
     unsigned END;
   };
 
-  // Check whether the mask is equivalent to the form: `END < N-i*END`.
+  // Check whether the mask is equivalent to the form: `END-1 < N-i*END`.
   virtual bool isValidMask(scf::ForOp &forOp, Value mask) const {
     assert(mask && "Expecting a valid mask");
 
@@ -102,8 +104,8 @@ class CanonicalMaskValidator final : public MaskValidatorBase {
   // `(N+END-1)%END > 0 && N > END`.
   virtual Value getVersioningCond(scf::ForOp &forOp, Value mask) const {
     MaskInfo maskInfo = getMaskInfo(forOp, mask);
-    assert(hasCanonicalUpperBound(forOp, maskInfo) &&
-           "Loop upper bound not in canonical form");
+    if (!hasCanonicalUpperBound(forOp, maskInfo))
+      return nullptr;
 
     OpBuilder builder(forOp);
     Location loc = forOp.getLoc();
@@ -113,12 +115,13 @@ class CanonicalMaskValidator final : public MaskValidatorBase {
 
     // The loop UB is a constant.
     if (isa<arith::ConstantIntOp>(defOp)) {
-      int64_t valN =
+      int64_t UB = cast<arith::ConstantIntOp>(defOp).value();
+      int64_t N =
           cast<arith::ConstantIntOp>(maskInfo.N.getDefiningOp()).value();
-      bool cond1 = ((valN + maskInfo.END - 1) % maskInfo.END) > 0;
-      bool cond2 = valN > maskInfo.END;
-      return builder.create<arith::ConstantIntOp>(
-          forOp.getLoc(), cond1 && cond2, builder.getI1Type());
+      unsigned END = maskInfo.END;
+      bool cond = UB <= ((N + END - 1) / END) - 1;
+      return builder.create<arith::ConstantIntOp>(forOp.getLoc(), cond,
+                                                  builder.getI1Type());
     }
 
     auto divOp = cast<arith::DivSIOp>(defOp);
@@ -137,6 +140,8 @@ class CanonicalMaskValidator final : public MaskValidatorBase {
     return builder.create<arith::AndIOp>(loc, cmp1, cmp2);
   }
 
+  virtual std::string getName() const { return "CanonicalMaskValidator"; }
+
   // Ensure the loop upper bound is in canonical form (N+END-1)/END.
   static bool hasCanonicalUpperBound(scf::ForOp &forOp,
                                      const MaskInfo &maskInfo) {
@@ -148,10 +153,10 @@ class CanonicalMaskValidator final : public MaskValidatorBase {
     // If the loop UB is constant, use `MaskInfo` to determine whether the UB
     // was folded from a canonical form.
     if (isa<arith::ConstantIntOp>(defOp)) {
-      int64_t valN =
+      int64_t UB = cast<arith::ConstantIntOp>(defOp).value();
+      int64_t N =
           cast<arith::ConstantIntOp>(maskInfo.N.getDefiningOp()).value();
-      return ((valN + maskInfo.END - 1) / maskInfo.END) ==
-             cast<arith::ConstantIntOp>(defOp).value();
+      return UB == ((N + maskInfo.END - 1) / maskInfo.END) - 1;
     }
 
     if (!isa<arith::DivSIOp>(defOp))
@@ -279,6 +284,8 @@ class InvariantMaskValidator final : public MaskValidatorBase {
     llvm_unreachable("Unexpected mask");
     return {};
   }
+
+  virtual std::string getName() const { return "InvariantMaskValidator"; }
 };
 
 // Collects masked operations in a loop that satisfy the condition imposed by
@@ -300,7 +307,8 @@ template <typename MaskValidator> class MaskedOpsCollector {
             maskValidator.isValidMask(forOp, tt::intel::getFinalValue(mask))) {
           maskedOps.insert(op);
           LLVM_DEBUG(llvm::dbgs()
-                     << "Collected masked operation: " << *op << "\n");
+                     << maskValidator.getName()
+                     << ": collected masked operation: " << *op << "\n");
         }
       }
     };