Skip to content

Commit e423617

Browse files
[LoadStoreOpToLLVM] Add support for boundary checks (#4701)
Prior to this PR, `StoreOpToBlockIOConversion` did not account for boundary checks, assuming they were always provided. This PR modifies `StoreOpToBlockIOConversion` to properly incorporate boundary checks. To avoid triggering hardware boundary protection, the implementation expands the base shape (and rebases the offset to zero) in each dimension for which no boundary check is provided. Signed-off-by: Whitney Tsang <[email protected]>
1 parent 3088449 commit e423617

File tree

2 files changed

+75
-10
lines changed

2 files changed

+75
-10
lines changed

test/TritonIntelGPU/blockptr_store.mlir

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,12 @@
11
// RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
22

33
#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 2], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
4-
#dot0 = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth=1}>
5-
#dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
64
module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
7-
tt.func public @matmul_no_scf_with_advance_kernel(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64) {
5+
tt.func public @matmul_no_scf_with_advance_kernel(%base: !tt.ptr<f16>, %width: i64, %height: i64, %rowStride: i64) {
86
%cst = arith.constant dense<0.000000e+00> : tensor<64x64xf16, #dpas>
9-
%c32_i32 = arith.constant 32 : i32
10-
%c-64_i32 = arith.constant -64 : i32
11-
%c-32_i32 = arith.constant -32 : i32
12-
%c64_i32 = arith.constant 64 : i32
137
%c0_i32 = arith.constant 0 : i32
148
%c1_i64 = arith.constant 1 : i64
15-
%13 = tt.make_tensor_ptr %arg2, [%arg3, %arg5], [%arg6, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x64xf16, #dpas>>
9+
%0 = tt.make_tensor_ptr %base, [%width, %height], [%rowStride, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x64xf16, #dpas>>
1610
// CHECK: %[[WARP_ID:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32
1711
// CHECK: %[[offsetBaseY:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
1812
// CHECK: %[[offsetBaseX:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
@@ -42,7 +36,58 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32,
4236
// CHECK: llvm.mlir.undef : vector<8xf16>
4337
// CHECK-COUNT-8: llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<8xf16>
4438
// CHECK: triton_gen.2Dblockstore {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X]], {{.*}} {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 1, cache_control = Default}
45-
tt.store %13, %cst {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"} : !tt.ptr<tensor<64x64xf16, #dpas>>
39+
tt.store %0, %cst {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"} : !tt.ptr<tensor<64x64xf16, #dpas>>
40+
tt.return
41+
}
42+
}
43+
44+
// -----
45+
46+
#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 2], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
47+
module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
48+
tt.func public @no_boundary_check(%base: !tt.ptr<f16>, %width: i64, %height: i64, %rowStride: i64) {
49+
%cst = arith.constant dense<0.000000e+00> : tensor<64x64xf16, #dpas>
50+
%c0_i32 = arith.constant 0 : i32
51+
%c1_i64 = arith.constant 1 : i64
52+
%0 = tt.make_tensor_ptr %base, [%width, %height], [%rowStride, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x64xf16, #dpas>>
53+
54+
// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i32) : i32
55+
// CHECK: %[[WARP_ID:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32
56+
57+
// CHECK: %[[offsetBaseY:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
58+
// CHECK: %[[offsetBaseX:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
59+
// CHECK: %[[baseHeight:.*]] = llvm.extractvalue {{.*}}[2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
60+
// CHECK: %[[baseWidth:.*]] = llvm.extractvalue {{.*}}[3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
61+
// CHECK: %[[rowStride:.*]] = llvm.extractvalue {{.*}}[4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
62+
// CHECK: %[[colStride:.*]] = llvm.extractvalue {{.*}}[5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
63+
// CHECK: %[[base:.*]] = llvm.extractvalue {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
64+
65+
// CHECK: %[[rowStride_i32:.*]] = llvm.trunc %[[rowStride]] : i64 to i32
66+
// CHECK: %[[PITCH:.*]] = llvm.mul %[[rowStride_i32]], %[[C2]]
67+
// CHECK-COUNT-32: llvm.extractvalue {{.*}} : !llvm.struct<(f16, f16, {{.*}})>
68+
69+
// COM: Skip the register, lane, warp and block to the offset computation which should be covered by the LL tests.
70+
// CHECK: %[[OFFSET_X:.*]] = llvm.add %[[offsetBaseX]], {{.*}} : i32
71+
// CHECK: %[[OFFSET_Y:.*]] = llvm.add %[[offsetBaseY]], {{.*}} : i32
72+
73+
// COM: When boundary check is absent:
74+
// CHECK: %[[baseWidth:.*]] = llvm.mlir.constant(64 : i32)
75+
// CHECK: %[[base1:.*]] = llvm.getelementptr %[[base]][%[[OFFSET_X]]] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i16
76+
// CHECK: %[[OFFSET_X:.*]] = llvm.mlir.constant(0 : i32) : i32
77+
// CHECK: %[[baseHeight:.*]] = llvm.mlir.constant(8 : i32)
78+
// CHECK: %[[OFF:.*]] = llvm.mul %[[OFFSET_Y]], %[[PITCH]] : i32
79+
// CHECK: %[[base:.*]] = llvm.getelementptr %[[base1]][%[[OFF]]] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8
80+
// CHECK: %[[OFFSET_Y:.*]] = llvm.mlir.constant(0 : i32) : i32
81+
82+
// CHECK: llvm.mlir.undef : vector<8xf16>
83+
// CHECK-COUNT-7: llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<8xf16>
84+
// CHECK: %[[VAL0:.*]] = llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<8xf16>
85+
// CHECK: %[[VAL:.*]] = llvm.bitcast %[[VAL0]] : vector<8xf16> to vector<8xi16>
86+
87+
// CHECK: triton_gen.2Dblockstore %[[base]], %[[baseWidth]], %[[baseHeight]], %[[PITCH]], %[[OFFSET_X]], %[[OFFSET_Y]], %[[VAL]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 1, cache_control = Default}
88+
// CHECK-COUNT-3: triton_gen.2Dblockstore {{.*}} {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 1, cache_control = Default}
89+
90+
tt.store %0, %cst {ttig.block_io = "row_major"} : !tt.ptr<tensor<64x64xf16, #dpas>>
4691
tt.return
4792
}
4893
}

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2628,6 +2628,7 @@ struct StoreOpToBlockIOConversion
26282628

26292629
width = b.trunc(i32_ty, width);
26302630
rowStride = b.trunc(i32_ty, rowStride);
2631+
Value addrElem = base;
26312632
// encoded as bytes.
26322633
Value baseWidth = b.mul(width, elemSizeInBytes);
26332634
Value baseHeight = b.trunc(i32_ty, height);
@@ -2674,6 +2675,25 @@ struct StoreOpToBlockIOConversion
26742675
Value offsetX = b.add(offsetBaseX, offsets[colDim].second);
26752676
Value offsetY = b.add(offsetBaseY, offsets[rowDim].second);
26762677

2678+
// To prevent triggering hardware boundary protection, expand the base
2679+
// shape sufficiently when boundary check is absent.
2680+
SetVector<unsigned> boundaryCheck(op.getBoundaryCheck().begin(),
2681+
op.getBoundaryCheck().end());
2682+
if (!boundaryCheck.contains(colDim)) {
2683+
baseWidth = b.i32_val(
2684+
std::max(64u, vBlocks * tileWidth * (elemSizeInBits / 8)));
2685+
// Use opaqueType as offsetX is in number of elements.
2686+
addrElem = b.gep(ptr_ty(ctx, 1), opaqueType, addrElem, offsetX);
2687+
offsetX = b.i32_val(0);
2688+
}
2689+
if (!boundaryCheck.contains(rowDim)) {
2690+
baseHeight = b.i32_val(tileHeight);
2691+
// Use i8_ty as pitch is in number of bytes.
2692+
Value off = b.mul(offsetY, pitch);
2693+
addrElem = b.gep(ptr_ty(ctx, 1), i8_ty, addrElem, off);
2694+
offsetY = b.i32_val(0);
2695+
}
2696+
26772697
// Compose the matrix by stacking the name into vector.
26782698
Value storeVal = rewriter.create<LLVM::UndefOp>(
26792699
loc,
@@ -2684,7 +2704,7 @@ struct StoreOpToBlockIOConversion
26842704

26852705
auto newOp = rewriter.create<TritonGEN::Matrix2DBlockStoreOp>(
26862706
loc,
2687-
/*ptr*/ base,
2707+
/*ptr*/ addrElem,
26882708
/*base_width*/ baseWidth,
26892709
/*base_height*/ baseHeight,
26902710
/*base_pitch*/ pitch,

0 commit comments

Comments
 (0)