// RUN: triton-opt %s -triton-intel-remove-masks | FileCheck %s
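
// The -triton-intel-remove-masks pass versions the K-loop of this GEMM-style
// kernel: when the K extent (%arg5) is an exact multiple of 32 and larger
// than 32, the boundary masks on the in-loop loads are provably all-true, so
// the pass emits an scf.if whose "then" region runs the loop with unmasked
// loads and whose "else" region keeps the original masked loads.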

module {
  tt.func public @test_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) {
    %c31_i32 = arith.constant 31 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<64x128xf32>
    %c127_i32 = arith.constant 127 : i32
    %c63_i32 = arith.constant 63 : i32
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x128xf16>
    %cst_1 = arith.constant dense<0.000000e+00> : tensor<64x32xf16>
    %c1_i32 = arith.constant 1 : i32
    %c0_i32 = arith.constant 0 : i32
    %cst_2 = arith.constant dense<32> : tensor<64x32xi32>
    %c32_i32 = arith.constant 32 : i32
    %c128_i32 = arith.constant 128 : i32
    %c64_i32 = arith.constant 64 : i32
    %c4_i32 = arith.constant 4 : i32
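    // Map the program id to the coordinates of the 64x128 output tile
    // (grouped launch order).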
    %0 = tt.get_program_id x : i32
    %1 = arith.addi %arg3, %c63_i32 : i32
    %2 = arith.divsi %1, %c64_i32 : i32
    %3 = arith.addi %arg4, %c127_i32 : i32
    %4 = arith.divsi %3, %c128_i32 : i32
    %5 = arith.muli %4, %c4_i32 : i32
    %6 = arith.divsi %0, %5 : i32
    %7 = arith.muli %6, %c4_i32 : i32
    %8 = arith.subi %2, %7 : i32
    %9 = arith.minsi %8, %c4_i32 : i32
    %10 = arith.remsi %0, %5 : i32
    %11 = arith.remsi %10, %9 : i32
    %12 = arith.addi %7, %11 : i32
    %13 = arith.divsi %10, %9 : i32
    %14 = arith.muli %12, %c64_i32 : i32
    %15 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
    %16 = tt.splat %14 : i32 -> tensor<64xi32>
    %17 = arith.addi %16, %15 : tensor<64xi32>
    %18 = tt.splat %arg3 : i32 -> tensor<64xi32>
    %19 = arith.remsi %17, %18 : tensor<64xi32>
    %20 = arith.muli %13, %c128_i32 : i32
    %21 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
    %22 = tt.splat %20 : i32 -> tensor<128xi32>
    %23 = arith.addi %22, %21 : tensor<128xi32>
    %24 = tt.splat %arg4 : i32 -> tensor<128xi32>
    %25 = arith.remsi %23, %24 : tensor<128xi32>
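    // Build the pointer tensors for the A (64x32) and B (32x128) input tiles.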
    %26 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
    %27 = tt.expand_dims %19 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
    %28 = tt.splat %arg6 : i32 -> tensor<64x1xi32>
    %29 = arith.muli %27, %28 : tensor<64x1xi32>
    %30 = tt.expand_dims %26 {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32>
    %31 = tt.broadcast %29 : tensor<64x1xi32> -> tensor<64x32xi32>
    %32 = tt.broadcast %30 : tensor<1x32xi32> -> tensor<64x32xi32>
    %33 = arith.addi %31, %32 : tensor<64x32xi32>
    %34 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<64x32x!tt.ptr<f16>>
    %35 = tt.addptr %34, %33 : tensor<64x32x!tt.ptr<f16>>, tensor<64x32xi32>
    %36 = tt.expand_dims %26 {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32>
    %37 = tt.splat %arg7 : i32 -> tensor<32x1xi32>
    %38 = arith.muli %36, %37 : tensor<32x1xi32>
    %39 = tt.expand_dims %25 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32>
    %40 = tt.broadcast %38 : tensor<32x1xi32> -> tensor<32x128xi32>
    %41 = tt.broadcast %39 : tensor<1x128xi32> -> tensor<32x128xi32>
    %42 = arith.addi %40, %41 : tensor<32x128xi32>
    %43 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<32x128x!tt.ptr<f16>>
    %44 = tt.addptr %43, %42 : tensor<32x128x!tt.ptr<f16>>, tensor<32x128xi32>
    %45 = arith.addi %arg5, %c31_i32 : i32
    %46 = arith.divsi %45, %c32_i32 : i32
    %47 = arith.muli %arg7, %c32_i32 : i32
    %48 = tt.splat %47 : i32 -> tensor<32x128xi32>
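    // K-loop: both loads are masked against the remaining K extent
    // (%arg5 - 32 * iteration), with zero padding out of bounds.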
    %49:3 = scf.for %arg9 = %c0_i32 to %46 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %35, %arg12 = %44) -> (tensor<64x128xf32>, tensor<64x32x!tt.ptr<f16>>, tensor<32x128x!tt.ptr<f16>>) : i32 {
      %67 = arith.muli %arg9, %c32_i32 : i32
      %68 = arith.subi %arg5, %67 : i32
      %69 = tt.splat %68 : i32 -> tensor<1x32xi32>
      %70 = arith.cmpi slt, %30, %69 : tensor<1x32xi32>
      %71 = tt.broadcast %70 : tensor<1x32xi1> -> tensor<64x32xi1>
      %72 = tt.load %arg11, %71, %cst_1 : tensor<64x32x!tt.ptr<f16>>
      %73 = tt.splat %68 : i32 -> tensor<32x1xi32>
      %74 = arith.cmpi slt, %36, %73 : tensor<32x1xi32>
      %75 = tt.broadcast %74 : tensor<32x1xi1> -> tensor<32x128xi1>
      %76 = tt.load %arg12, %75, %cst_0 : tensor<32x128x!tt.ptr<f16>>
      %77 = tt.dot %72, %76, %arg10, inputPrecision = tf32 : tensor<64x32xf16> * tensor<32x128xf16> -> tensor<64x128xf32>
      %78 = tt.addptr %arg11, %cst_2 : tensor<64x32x!tt.ptr<f16>>, tensor<64x32xi32>
      %79 = tt.addptr %arg12, %48 : tensor<32x128x!tt.ptr<f16>>, tensor<32x128xi32>
      scf.yield %77, %78, %79 : tensor<64x128xf32>, tensor<64x32x!tt.ptr<f16>>, tensor<32x128x!tt.ptr<f16>>
    }
    %50 = arith.truncf %49#0 : tensor<64x128xf32> to tensor<64x128xf16>
    tt.return
  }

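  // Expected output: the loop is versioned on the condition that %arg5 is a
  // non-trivial multiple of 32; the "then" loop loads without masks, while
  // the "else" loop retains the original masked loads.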
  // CHECK: tt.func public @test_kernel([[PARAM_0_:%.+]]: !tt.ptr<f16> {tt.divisibility = 16 : i32}, [[PARAM_1_:%.+]]: !tt.ptr<f16> {tt.divisibility = 16 : i32}, [[PARAM_2_:%.+]]: !tt.ptr<f16> {tt.divisibility = 16 : i32}, [[PARAM_3_:%.+]]: i32 {tt.divisibility = 16 : i32}, [[PARAM_4_:%.+]]: i32 {tt.divisibility = 16 : i32}, [[PARAM_5_:%.+]]: i32 {tt.divisibility = 16 : i32}, [[PARAM_6_:%.+]]: i32 {tt.divisibility = 16 : i32}, [[PARAM_7_:%.+]]: i32 {tt.divisibility = 16 : i32}, [[PARAM_8_:%.+]]: i32 {tt.divisibility = 16 : i32}) {
  // CHECK: [[CST_0_i32:%.+]] = arith.constant 0 : i32
  // CHECK: [[CST_32_i32:%.+]] = arith.constant 32 : i32
  // CHECK: [[REM:%.+]] = arith.remsi [[PARAM_5_]], [[CST_32_i32]] : i32
  // CHECK: [[CMP1:%.+]] = arith.cmpi eq, [[REM]], [[CST_0_i32]] : i32
  // CHECK: [[CMP2:%.+]] = arith.cmpi sgt, [[PARAM_5_]], [[CST_32_i32]] : i32
  // CHECK: [[VER_COND:%.+]] = arith.andi [[CMP1]], [[CMP2]] : i1
  // CHECK: [[LOOP_VER:%.+]] = scf.if [[VER_COND]] -> (tensor<64x128xf32>) {
  // CHECK: [[THEN_LOOP_RES:%.+]]:3 = scf.for {{.*}} iter_args([[VAR_arg10:%.+]] = {{.*}}, [[VAR_arg11:%.+]] = {{.*}}, [[VAR_arg12:%.+]] = {{.*}}) -> (tensor<64x128xf32>, tensor<64x32x!tt.ptr<f16>>, tensor<32x128x!tt.ptr<f16>>) : i32 {
  // CHECK: [[LOAD_A1:%.+]] = tt.load [[VAR_arg11]] : tensor<64x32x!tt.ptr<f16>>
  // CHECK: [[LOAD_B1:%.+]] = tt.load [[VAR_arg12]] : tensor<32x128x!tt.ptr<f16>>
  // CHECK: scf.yield {{.*}}, {{.*}}, {{.*}} : tensor<64x128xf32>, tensor<64x32x!tt.ptr<f16>>, tensor<32x128x!tt.ptr<f16>>
  // CHECK: }
  // CHECK: scf.yield [[THEN_LOOP_RES]]#0 : tensor<64x128xf32>
  // CHECK: } else {
  // CHECK: [[ELSE_LOOP_RES:%.+]]:3 = scf.for {{.*}} iter_args([[VAR_arg10:%.+]] = {{.*}}, [[VAR_arg11:%.+]] = {{.*}}, [[VAR_arg12:%.+]] = {{.*}}) -> (tensor<64x128xf32>, tensor<64x32x!tt.ptr<f16>>, tensor<32x128x!tt.ptr<f16>>) : i32 {
  // CHECK: [[LOAD_A2:%.+]] = tt.load [[VAR_arg11]], {{.*}}, {{.*}} : tensor<64x32x!tt.ptr<f16>>
  // CHECK: [[LOAD_B2:%.+]] = tt.load [[VAR_arg12]], {{.*}}, {{.*}} : tensor<32x128x!tt.ptr<f16>>
  // CHECK: scf.yield {{.*}}, {{.*}}, {{.*}} : tensor<64x128xf32>, tensor<64x32x!tt.ptr<f16>>, tensor<32x128x!tt.ptr<f16>>
  // CHECK: }
  // CHECK: scf.yield [[ELSE_LOOP_RES]]#0 : tensor<64x128xf32>
  // CHECK: }
  // CHECK: tt.return
  // CHECK: }
}