
Commit 59b7b25

Always do boundary check on Tensor-Descriptor lowering (#4303)
Closes #4137, #4140, #4221.

This PR fixes failures in several tensor-descriptor tests. The failures occurred because the `tensor_descriptor.store` operation was writing too far out of bounds and overwrote a reference array allocated nearby. The upstream [`RewriteTensorDescriptorToPointer` pass](https://github.com/triton-lang/triton/blob/main/lib/Dialect/Triton/Transforms/RewriteTensorDescriptorToPointer.cpp) always generates boundary checks for both [stores](https://github.com/triton-lang/triton/blob/09dc29800e918d5f5c8df4279d124f51e0a94987/lib/Dialect/Triton/Transforms/RewriteTensorDescriptorToPointer.cpp#L286) and [loads](https://github.com/triton-lang/triton/blob/09dc29800e918d5f5c8df4279d124f51e0a94987/lib/Dialect/Triton/Transforms/RewriteTensorDescriptorToPointer.cpp#L263) using a boolean mask. The Intel-specific lowering, however, performs the following conversion without adding any boundary checks:

```
tensor_descriptor --(without bound-check)--> tensor_pointer --(without bound-check)--> llvm.load/store
```

The boolean mask based on the tensor shape is [supposed to be generated](https://github.com/intel/intel-xpu-backend-for-triton/blob/ae46511660d6a699132c67387394f511e825c90f/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp#L2558-L2564) in the `LoadStoreOpToLLVM` conversion pass, but that code [relies on the `boundaryCheck` attribute](https://github.com/intel/intel-xpu-backend-for-triton/blob/ae46511660d6a699132c67387394f511e825c90f/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp#L2564) of `tt.StoreOp`, which is not set during the `tensor_descriptor --> tensor_pointer` conversion. This PR fixes the problem by adding a `boundaryCheck` attribute to every load/store operation created in the `TensorDescToBlockPointer` pass.

---------

Signed-off-by: dchigarev <[email protected]>
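To make the failure mode concrete, here is a minimal, hypothetical sketch (not taken from the test suite) of the pattern the affected tests exercise; it assumes the upstream `tl.make_tensor_descriptor` / `desc.store` API, and the kernel name, shapes, and block sizes are illustrative. When `M` is not a multiple of `M_BLOCK`, the last block extends past the tensor shape, and without a boundary check the trailing rows are written into whatever memory follows the output buffer.

```python
# Hypothetical reproduction sketch (names and shapes are illustrative).
import triton
import triton.language as tl


@triton.jit
def store_blocks(out_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
    # Descriptor describing an M x N row-major tensor, accessed in
    # M_BLOCK x N_BLOCK tiles.
    desc = tl.make_tensor_descriptor(
        out_ptr,
        shape=[M, N],
        strides=[N, 1],
        block_shape=[M_BLOCK, N_BLOCK],
    )
    pid = tl.program_id(0)
    value = tl.full([M_BLOCK, N_BLOCK], 1.0, tl.float32)
    # If M is not a multiple of M_BLOCK, the last program's tile is partially
    # out of bounds; the descriptor's shape is what the lowering must use to
    # bound-check (mask) the trailing rows instead of writing past the buffer.
    desc.store([pid * M_BLOCK, 0], value)
```

The fix below attaches a `boundaryCheck` covering every dimension to the `tt.load`/`tt.store` operations that `TensorDescToBlockPointer` creates, so the existing mask generation in `LoadStoreOpToLLVM` can pick it up.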
1 parent 9bb257c · commit 59b7b25

File tree

4 files changed (+11, -13 lines)

python/test/unit/language/test_tensor_descriptor.py

Lines changed: 0 additions & 6 deletions
```diff
@@ -209,8 +209,6 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
 @pytest.mark.parametrize("dtype_str", tma_dtypes)
 @pytest.mark.parametrize("K_BLOCK", [16, 32, 64, 128])
 def test_tensor_descriptor_store3d(dtype_str, K_BLOCK, device):
-    if is_xpu() and dtype_str == 'bfloat16':
-        pytest.skip("FIXME: issue #4137")
 
     @triton.jit
     def kernel(out_ptr, a_ptr, M, N, K, stride_m, stride_n, stride_k, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr,
@@ -329,8 +327,6 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
 def test_tensor_descriptor_store_nd(dtype_str, num_ctas, ndim, INNER_BLOCK, device):
     if num_ctas == 2 and (not is_cuda() or torch.cuda.get_device_capability(0)[0] not in (9, 10)):
         pytest.xfail("CTAs is unsupported for these cards")
-    if is_xpu() and ndim not in [1]:
-        pytest.skip("FIXME: issue #4140")
 
     @triton.jit
     def kernel(out_ptr, a_ptr, shape, strides, BLOCK_SHAPE):
@@ -926,8 +922,6 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
 @pytest.mark.parametrize("ndim", [3, 4, 5])
 @pytest.mark.parametrize("INNER_BLOCK", [16, 32, 64, 128])
 def test_tensor_descriptor_rank_reducing_load(dtype_str, ndim, INNER_BLOCK, device):
-    if is_xpu():
-        pytest.skip("FIXME: issue #4221")
 
     @triton.jit
     def kernel(out_ptr, a_ptr, shape, strides, BLOCK_SHAPE):
```

test/Triton/Intel/TensorDescToBlockPointer/basic.mlir

Lines changed: 2 additions & 2 deletions
```diff
@@ -19,7 +19,7 @@ module {
 // CHECK-DAG: [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
 // CHECK-DAG: [[EXTSI_PARAM_2:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
 // CHECK: [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] {{.*}} : <tensor<16x128xf32>>
-// CHECK: [[LOAD:%.+]] = tt.load [[TENSOR_PTR]] : !tt.ptr<tensor<16x128xf32>>
+// CHECK: [[LOAD:%.+]] = tt.load [[TENSOR_PTR]] {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<16x128xf32>>
 // CHECK: tt.return
 // CHECK: }
 
@@ -43,7 +43,7 @@ module {
 // CHECK-DAG: [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
 // CHECK-DAG: [[EXTSI_PARAM_2:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
 // CHECK: [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] {{.*}} : <tensor<16x128xf32>>
-// CHECK: tt.store [[TENSOR_PTR]], [[CST]] : !tt.ptr<tensor<16x128xf32>>
+// CHECK: tt.store [[TENSOR_PTR]], [[CST]] {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<16x128xf32>>
 // CHECK: tt.return
 // CHECK: }
 }
```

test/Triton/Intel/TensorDescToBlockPointer/loop.mlir

Lines changed: 2 additions & 2 deletions
```diff
@@ -31,7 +31,7 @@ module {
 // CHECK-DAG: [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
 // CHECK-DAG: [[EXTSI_PARAM_2b:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
 // CHECK: [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2b]]], {{\[}}[[EXTSI_PARAM_2a]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[IDX_CAST_1]]] {{.*}} : <tensor<16x32xf16>>
-// CHECK: [[LOAD:%.+]] = tt.load [[TENSOR_PTR]] : !tt.ptr<tensor<16x32xf16>>
+// CHECK: [[LOAD:%.+]] = tt.load [[TENSOR_PTR]] {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<16x32xf16>>
 // CHECK: [[ADD:%.+]] = arith.addf [[VAR_arg2]], [[LOAD]] : tensor<16x32xf16>
 // CHECK: scf.yield {{.*}}, [[ADD]] : !tt.tensordesc<tensor<16x32xf16>>, tensor<16x32xf16>
 // CHECK: }
@@ -124,7 +124,7 @@ module {
 // CHECK-DAG: [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
 // CHECK-DAG: [[EXTSI_PARAM_2b:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
 // CHECK: [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2b]]], {{\[}}[[EXTSI_PARAM_2a]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[IDX_CAST_1]]] {{.*}} : <tensor<16x32xf16>>
-// CHECK: tt.store [[TENSOR_PTR]], [[VAR_arg2]] : !tt.ptr<tensor<16x32xf16>>
+// CHECK: tt.store [[TENSOR_PTR]], [[VAR_arg2]] {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<16x32xf16>>
 // CHECK: [[ADD:%.+]] = arith.addf [[VAR_arg2]], [[CST]] : tensor<16x32xf16>
 // CHECK: scf.yield {{.*}}, [[ADD]] : !tt.tensordesc<tensor<16x32xf16>>, tensor<16x32xf16>
 // CHECK: }
```

third_party/intel/lib/Dialect/Triton/Transforms/TensorDescToBlockPointer.cpp

Lines changed: 7 additions & 3 deletions
```diff
@@ -202,17 +202,21 @@ struct TritonIntelTensorDescToBlockPointer
       llvm::dbgs().indent(2) << makeTensorPtrOp << "\n";
     });
 
+    SmallVector<int32_t> boundaryCheck;
+    for (size_t i = 0; i < makeTensorDescOp.getShape().size(); ++i)
+      boundaryCheck.push_back(i);
     constexpr bool isLoad = std::is_same_v<OpTy, tt::DescriptorLoadOp>;
     if constexpr (isLoad) {
       auto loadOp = builder.createOrFold<tt::LoadOp>(
-          loc, makeTensorPtrOp, op.getCache(), op.getEvict(),
+          loc, makeTensorPtrOp, boundaryCheck, /*padding*/ std::nullopt,
+          op.getCache(), op.getEvict(),
           /*volatile*/ false);
       LLVM_DEBUG(llvm::dbgs().indent(2) << loadOp << "\n");
       op.replaceAllUsesWith(loadOp);
     } else {
       [[maybe_unused]] auto storeOp = builder.createOrFold<tt::StoreOp>(
-          loc, makeTensorPtrOp, op.getSrc(), tt::CacheModifier::NONE,
-          tt::EvictionPolicy::NORMAL);
+          loc, makeTensorPtrOp, op.getSrc(), boundaryCheck,
+          tt::CacheModifier::NONE, tt::EvictionPolicy::NORMAL);
       LLVM_DEBUG(llvm::dbgs().indent(2) << storeOp << "\n");
     }
 
```