Avoid pass failure for descriptor load/store operations using a tensor descriptor allocated on the host (#4812)

etiotto · web-flow · commit 4afd331f020b · 2025-07-31T15:48:46.000Z
When a tensor descriptor is allocated on the host and passed to a Triton
kernel as an argument, the pass that transforms descriptor load/store
operations into equivalent block pointer operations currently fails.
This PR rectifies the situation and allows the pass to complete so that
those operations can subsequently be lowered to use the 'unwrapped'
descriptor argument passed to the kernel.

---------

Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
diff --git a/test/Triton/Intel/TensorDescToBlockPointer/invalid.mlir b/test/Triton/Intel/TensorDescToBlockPointer/invalid.mlir
@@ -1,13 +1,23 @@
 // RUN: triton-opt %s -triton-intel-tdesc-to-block-pointer  | FileCheck %s
 
-// COM: Test make_tensor_descriptor is not rewritten when it is used by descriptor_gather.
-// CHECK-NOT: make_tensor_ptr
-// CHECK: tt.make_tensor_descriptor
-module {
-  tt.func public @test_descriptor_gather(%arg0: !tt.ptr<f32>, %arg1: i64, %arg2: tensor<32xi32>, %arg3: i32) {
-    %c128_i32 = arith.constant 128 : i32
-    %0 = tt.make_tensor_descriptor %arg0, [%c128_i32, %c128_i32], [%arg1, %arg1] : <f32>, <tensor<1x32xf32>>
-    %1 = tt.descriptor_gather %0[%arg2, %arg3] : (!tt.tensordesc<tensor<1x32xf32>>, tensor<32xi32>, i32) -> tensor<32x32xf32>
-    tt.return
-  }
+// COM: Test that `make_tensor_descriptor` is not rewritten when it is used by `descriptor_gather`.
+tt.func public @test_descriptor_gather(%arg0: !tt.ptr<f32>, %arg1: i64, %arg2: tensor<32xi32>, %arg3: i32) {
+  // CHECK-NOT: make_tensor_ptr
+  // CHECK: tt.make_tensor_descriptor
+ %c128_i32 = arith.constant 128 : i32
+  %0 = tt.make_tensor_descriptor %arg0, [%c128_i32, %c128_i32], [%arg1, %arg1] : <f32>, <tensor<1x32xf32>>
+  %1 = tt.descriptor_gather %0[%arg2, %arg3] : (!tt.tensordesc<tensor<1x32xf32>>, tensor<32xi32>, i32) -> tensor<32x32xf32>
+  tt.return
+}
+
+// COM: Test that `descriptor_load/descriptor_store` operations are not rewritten if they use a tensor descriptor passed as a function argument.
+tt.func public @test_host_descriptor(%desc: !tt.tensordesc<tensor<2x16xf16>>) {
+  // CHECK: tt.func public @test_host_descriptor([[DESC:%.*]]: !tt.tensordesc<tensor<2x16xf16>>) {
+  // CHECK: tt.descriptor_load [[DESC]]
+  // CHECK: tt.descriptor_store [[DESC]]
+  %c2_i32 = arith.constant 2 : i32
+  %c32_i32 = arith.constant 32 : i32
+  %0 = tt.descriptor_load %desc[%c2_i32, %c32_i32] : !tt.tensordesc<tensor<2x16xf16>> -> tensor<2x16xf16>
+  tt.descriptor_store %desc[%c2_i32, %c32_i32], %0 : !tt.tensordesc<tensor<2x16xf16>>, tensor<2x16xf16>
+  tt.return
 }
diff --git a/third_party/intel/lib/Dialect/Triton/Transforms/TensorDescToBlockPointer.cpp b/third_party/intel/lib/Dialect/Triton/Transforms/TensorDescToBlockPointer.cpp
@@ -232,18 +232,24 @@ struct TritonIntelTensorDescToBlockPointer
                              bool> = true>
   LogicalResult rewriteDescriptorLoadOrStoreOp(OpTy op) {
     assert(op && "Expecting a valid operation");
+
+    // At this point we expect to have transformed `make_tensor_descriptor` into
+    // a `make_block_ptr` operation, except when the tensor descriptor is
+    // allocated on the host and passed to the kernel as an argument.
+    Value operand = op.getOperand(0);
+    if (isa<tt::TensorDescType>(operand.getType()))
+      return failure();
+
     LLVM_DEBUG(llvm::dbgs() << "Rewriting: " << op << "\n");
 
     OpBuilder builder(op);
     Location loc = op.getLoc();
-    Value ptr = op.getOperand(0);
-    assert(triton::isTensorPointerType(ptr.getType()) &&
+    assert(triton::isTensorPointerType(operand.getType()) &&
            "Expecting a block ptr");
-    auto ptrType = cast<tt::PointerType>(ptr.getType());
+    auto ptrType = cast<tt::PointerType>(operand.getType());
     auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
-
-    ptr =
-        builder.create<tt::AdvanceOp>(loc, ptr.getType(), ptr, op.getIndices());
+    Value ptr =
+        builder.create<tt::AdvanceOp>(loc, ptrType, operand, op.getIndices());
 
     SmallVector<int32_t> boundaryCheck;
     for (size_t i = 0; i < tensorType.getRank(); ++i)