Skip to content

Commit e2f6663

Browse files
htyumeta-codesync[bot]
authored and committed
[TLX] Enable tensor descriptor pipelining (#706)
Summary: This PR enables tensor descriptor pipelining in TLX to improve performance of TMA operations on Hopper and Blackwell GPUs. The implementation includes a new make_tensor_descriptor API with custom MLIR parsing and support for automatic scratch memory allocation. More specifically, the Tensor Descriptor Pipelining Infrastructure includes: - Implemented pipelining support for tensor descriptors to enable efficient asynchronous data movement - Added support for automatic scratch memory allocation for descriptor storage - Updated TMA lowering pass to handle pipelined descriptor operations - New `tlx.make_tensor_descriptor` API Example usage ``` # For cases requiring manual memory management desc_ptr = tlx.global_alloc(nbytes=128, alignment=128) desc = tlx.make_tensor_descriptor( desc_ptr=desc_ptr, base=tensor_ptr, shape=[M, N], strides=[N, tl.constexpr(1)], block_shape=[64, 64], padding_option="zero", # Handle out-of-bounds accesses ) # Use the descriptor with async load/store operations buffer = tl.zeros([64, 64], dtype=tl.float16) tlx.async_descriptor_load(desc, buffer, [row_offset, col_offset]) ``` Changes are made to existing lit tests to maintain test compatibility with the new parser to avoid supporting legacy type parsing. Actually upstream already started adding new lit tests in that way. Pull Request resolved: #706 Reviewed By: njriasan Differential Revision: D88103067 Pulled By: htyu fbshipit-source-id: 0ca3340add4bae9693b81929c612e91499bb9b84
1 parent 37788c2 commit e2f6663

File tree

22 files changed

+479
-79
lines changed

22 files changed

+479
-79
lines changed

README.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,39 @@ While this approach places more responsibility on the user, it reduces the compi
6969

7070
Store a chunk of data from local memory into global memory buffer. The global address, strides, and buffer size are defined by the memory descriptor.
7171

72+
- `tlx.make_tensor_descriptor(desc_ptr, base, shape, strides, block_shape, padding_option)`
73+
74+
Create a TMA (Tensor Memory Accelerator) descriptor for efficient asynchronous data movement on Hopper and Blackwell GPUs.
75+
76+
**Parameters:**
77+
- `desc_ptr` (optional): Pointer to global memory for descriptor storage. Pass `None` for automatic allocation.
78+
- `base`: Base pointer to the tensor in global memory
79+
- `shape`: List of tensor dimensions (dynamic, runtime values)
80+
- `strides`: List of tensor strides (dynamic, runtime values)
81+
- `block_shape`: Shape of the block to be loaded/stored (compile-time constants)
82+
- `padding_option`: Padding option for out-of-bounds accesses (default: "zero")
83+
84+
**Example:**
85+
```python
86+
# Create a 2D tensor descriptor with automatic scratch allocation
87+
desc = tlx.make_tensor_descriptor(
88+
desc_ptr=None, # Compiler allocates scratch memory automatically
89+
base=tensor_ptr,
90+
shape=[M, N],
91+
strides=[N, tl.constexpr(1)],
92+
block_shape=[64, 64],
93+
)
94+
95+
# Or with explicit scratch allocation for advanced use cases
96+
desc_ptr = tlx.global_alloc(nbytes=128, alignment=128)
97+
desc = tlx.make_tensor_descriptor(
98+
desc_ptr=desc_ptr,
99+
base=tensor_ptr,
100+
shape=[M, N],
101+
strides=[N, tl.constexpr(1)],
102+
block_shape=[64, 64],
103+
)
104+
```
72105

73106
- `tlx.async_load(tensor_ptr, buffer, optional_mask, optional_other, cache_modifier, eviction_policy, is_volatile)`
74107

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,7 +1054,7 @@ def TT_MakeTensorPtrOp : TT_Op<"make_tensor_ptr",
10541054
//
10551055
def TT_MakeTensorDescOp : TT_Op<"make_tensor_descriptor", [
10561056
Pure,
1057-
SameVariadicOperandSize,
1057+
AttrSizedOperandSegments,
10581058
]> {
10591059
let summary = "Make a tensor descriptor type with meta information of the parent tensor and block size";
10601060

@@ -1067,15 +1067,18 @@ def TT_MakeTensorDescOp : TT_Op<"make_tensor_descriptor", [
10671067
TT_Ptr:$base,
10681068
Variadic<I32>:$shape,
10691069
Variadic<I64>:$strides,
1070+
Optional<TT_Ptr>:$descPtr,
10701071
DefaultValuedAttr<TT_PaddingOptionAttr, "::mlir::triton::PaddingOption::PAD_ZERO">:$padding
10711072
);
10721073

10731074
let results = (outs TT_TensorDescType:$result);
10741075

1075-
let assemblyFormat = "$base `,` `[` $shape `]` `,` `[` $strides `]` attr-dict `:` type($base) `,` type($result)";
1076+
let hasCustomAssemblyFormat = 1;
10761077

10771078
let builders = [
10781079
OpBuilder<(ins "Value":$base, "ValueRange":$shape, "ValueRange":$strides, "ArrayRef<int32_t>":$blockShape, "bool":$isSignedInteger,
1080+
"triton::PaddingOption":$padding)>,
1081+
OpBuilder<(ins "Value":$base, "ValueRange":$shape, "ValueRange":$strides, "Value":$descPtr, "ArrayRef<int32_t>":$blockShape, "bool":$isSignedInteger,
10791082
"triton::PaddingOption":$padding)>
10801083
];
10811084

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 137 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1149,7 +1149,143 @@ void MakeTensorDescOp::build(OpBuilder &builder, OperationState &state,
11491149
auto descTy =
11501150
TensorDescType::get(builder.getContext(), blockTy, isSignedInteger);
11511151
auto paddingAttr = PaddingOptionAttr::get(builder.getContext(), padding);
1152-
return build(builder, state, descTy, base, shape, strides, paddingAttr);
1152+
return build(builder, state, descTy, base, shape, strides,
1153+
/*descPtr=*/Value(), paddingAttr);
1154+
}
1155+
1156+
void MakeTensorDescOp::build(OpBuilder &builder, OperationState &state,
                             Value base, ValueRange shape, ValueRange strides,
                             Value descPtr, ArrayRef<int32_t> blockShape,
                             bool isSignedInteger,
                             triton::PaddingOption padding) {
  // Builder overload that threads an explicit descriptor-storage pointer
  // (descPtr) through to the generated build(). Pass a null Value to have
  // the TMA lowering allocate global scratch memory automatically.
  auto basePtrTy = dyn_cast<triton::PointerType>(base.getType());
  if (!basePtrTy)
    llvm::report_fatal_error("Expected pointer type");

  // The block shape arrives as i32 constants; RankedTensorType wants i64 dims.
  SmallVector<int64_t> dims(blockShape.begin(), blockShape.end());
  auto blockTy = RankedTensorType::get(dims, basePtrTy.getPointeeType());
  auto descTy =
      TensorDescType::get(builder.getContext(), blockTy, isSignedInteger);
  build(builder, state, descTy, base, shape, strides, descPtr,
        PaddingOptionAttr::get(builder.getContext(), padding));
}
1174+
1175+
ParseResult MakeTensorDescOp::parse(OpAsmParser &parser,
                                    OperationState &result) {
  // Custom assembly parser for:
  //   $base `,` `[` $shape `]` `,` `[` $strides `]`
  //   (`,` `descPtr` `=` $descPtr `:` type($descPtr))?
  //   attr-dict `:` type($base) `,` type($result)
  // A custom parser is required because the optional $descPtr operand cannot
  // be expressed in the declarative assembly format next to the two variadic
  // operand groups (AttrSizedOperandSegments).

  OpAsmParser::UnresolvedOperand base;
  SmallVector<OpAsmParser::UnresolvedOperand> shape;
  SmallVector<OpAsmParser::UnresolvedOperand> strides;
  Type baseType, resultType;

  // Base pointer operand.
  if (parser.parseOperand(base) || parser.parseComma())
    return failure();

  // Runtime tensor shape: `[` $shape `]`.
  if (parser.parseLSquare() ||
      parser.parseOperandList(shape, OpAsmParser::Delimiter::None) ||
      parser.parseRSquare() || parser.parseComma())
    return failure();

  // Runtime strides: `[` $strides `]`.
  if (parser.parseLSquare() ||
      parser.parseOperandList(strides, OpAsmParser::Delimiter::None) ||
      parser.parseRSquare())
    return failure();

  // Optional explicit descriptor-storage pointer.
  OpAsmParser::UnresolvedOperand descPtr;
  Type descPtrType;
  bool hasDescPtr = false;

  if (succeeded(parser.parseOptionalComma())) {
    if (succeeded(parser.parseOptionalKeyword("descPtr"))) {
      if (parser.parseEqual() || parser.parseOperand(descPtr) ||
          parser.parseColon() || parser.parseType(descPtrType))
        return failure();
      hasDescPtr = true;
    } else {
      // A trailing comma must introduce the descPtr clause.
      return parser.emitError(parser.getCurrentLocation(),
                              "expected 'descPtr' keyword");
    }
  }

  // Attribute dictionary (e.g. `padding`).
  if (parser.parseOptionalAttrDict(result.attributes))
    return failure();

  // Trailing types: `:` type($base) `,` type($result).
  if (parser.parseColon() || parser.parseType(baseType) ||
      parser.parseComma() || parser.parseType(resultType))
    return failure();

  // Resolve operands in ODS declaration order: base, shape..., strides...,
  // descPtr?.
  if (parser.resolveOperand(base, baseType, result.operands))
    return failure();

  // Shape operands are fixed to i32 by the op definition.
  auto i32Type = parser.getBuilder().getI32Type();
  if (parser.resolveOperands(shape, i32Type, result.operands))
    return failure();

  // Stride operands are fixed to i64 by the op definition.
  auto i64Type = parser.getBuilder().getI64Type();
  if (parser.resolveOperands(strides, i64Type, result.operands))
    return failure();

  // Resolve optional descPtr with its explicitly parsed type.
  if (hasDescPtr) {
    if (parser.resolveOperand(descPtr, descPtrType, result.operands))
      return failure();
  }

  // Record how many operands belong to each segment so the
  // AttrSizedOperandSegments trait can slice the flat operand list:
  //   [ base, shape..., strides..., descPtr? ]
  SmallVector<int32_t, 4> segmentSizes;
  segmentSizes.push_back(1);                                    // base
  segmentSizes.push_back(static_cast<int32_t>(shape.size()));   // shape
  segmentSizes.push_back(static_cast<int32_t>(strides.size())); // strides
  segmentSizes.push_back(hasDescPtr ? 1 : 0);                   // descPtr

  // Use the trait-provided attribute name instead of a hard-coded string:
  // the attribute was renamed across MLIR versions
  // ("operand_segment_sizes" -> "operandSegmentSizes"), and registering it
  // under the wrong name fails op verification and desynchronizes parse()
  // from print(), which elides the trait's canonical name.
  auto &builder = parser.getBuilder();
  result.addAttribute(getOperandSegmentSizeAttr(),
                      builder.getDenseI32ArrayAttr(segmentSizes));

  // Result type.
  result.addTypes(resultType);

  return success();
}
1266+
1267+
void MakeTensorDescOp::print(OpAsmPrinter &p) {
  // Mirror of MakeTensorDescOp::parse:
  //   $base `,` `[` $shape `]` `,` `[` $strides `]`
  //   (`,` `descPtr` `=` $descPtr `:` type($descPtr))?
  //   attr-dict `:` type($base) `,` type($result)

  p << " " << getBase() << ", [" << getShape() << "], [" << getStrides() << "]";

  // Print descPtr (with its type, which is not recoverable from the
  // base/result types) only when present.
  if (getDescPtr()) {
    p << ", descPtr = " << getDescPtr() << " : " << getDescPtr().getType();
  }

  // Elide attributes that are implied by the format or hold default values.
  SmallVector<StringRef> elidedAttrs;
  // Use the trait accessor rather than a hard-coded string so the elided
  // name always matches the attribute the AttrSizedOperandSegments trait
  // (and parse()) actually uses, across MLIR attribute-name changes.
  elidedAttrs.push_back(getOperandSegmentSizeAttr());
  // Elide padding when it equals the ODS default (PAD_ZERO) so round-trips
  // stay terse.
  if (getPadding() == triton::PaddingOption::PAD_ZERO) {
    elidedAttrs.push_back("padding");
  }
  p.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);

  p << " : " << getBase().getType() << ", " << getType();
}
11541290

11551291
// The following ops, including `call`, `func`, and `return` are copied and

lib/Dialect/TritonNvidiaGPU/Transforms/TMALowering.cpp

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -197,14 +197,24 @@ class TMACreateDescLowering : public OpRewritePattern<MakeTensorDescOp> {
197197
PatternRewriter &rewriter) const override {
198198
MLIRContext *ctx = op.getContext();
199199
auto loc = op.getLoc();
200-
auto alloc = rewriter.create<triton::gpu::GlobalScratchAllocOp>(
201-
loc, getPointerType(rewriter.getI8Type()), TMA_SIZE_BYTES, TMA_ALIGN);
202-
if (failed(createTMADesc(alloc, op, rewriter))) {
200+
201+
Value descPtr;
202+
// If desc_ptr is provided, use it directly without creating global scratch
203+
if (op.getDescPtr()) {
204+
descPtr = op.getDescPtr();
205+
} else {
206+
// Create global scratch allocation when desc_ptr is not provided
207+
auto alloc = rewriter.create<triton::gpu::GlobalScratchAllocOp>(
208+
loc, getPointerType(rewriter.getI8Type()), TMA_SIZE_BYTES, TMA_ALIGN);
209+
descPtr = alloc.getResult();
210+
}
211+
212+
if (failed(createTMADesc(descPtr, op, rewriter))) {
203213
return failure();
204214
}
205-
rewriter.create<TensormapFenceproxyAcquireOp>(loc, alloc.getResult());
206-
auto newDesc = rewriter.create<ReinterpretTensorDescOp>(loc, op.getType(),
207-
alloc.getResult());
215+
rewriter.create<TensormapFenceproxyAcquireOp>(loc, descPtr);
216+
auto newDesc =
217+
rewriter.create<ReinterpretTensorDescOp>(loc, op.getType(), descPtr);
208218
rewriter.replaceOp(op, newDesc);
209219
return success();
210220
}

python/test/unit/language/test_tlx.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2425,3 +2425,60 @@ def stoch_round_seed_kernel(x_ptr, y_ptr, seed, BLOCK_SIZE: tl.constexpr):
24252425
different_count = (b1.float() != b2.float()).sum().item()
24262426
assert different_count > SIZE * 0.1, (f"Different seeds should produce different results, "
24272427
f"but only {different_count}/{SIZE} values differ")
2428+
2429+
2430+
@pytest.mark.skipif(not is_hopper_or_newer(), reason="Need Hopper or newer")
def test_make_tensor_descriptor(device):
    """Test global_alloc and make_tensor_descriptor together with TMA operations."""

    # Allocator callback Triton invokes for kernel global-scratch requests.
    # The asserts pin the expected request: 128-byte alignment on the default
    # stream (stream id 0).
    def alloc_fn(size: int, align: int, stream: Optional[int]):
        assert align == 128
        assert stream == 0
        return torch.empty(size, dtype=torch.int8, device=device)

    @triton.jit
    def kernel(input_ptr, output_ptr, SIZE, BLOCK_SIZE: tl.constexpr):
        # Allocate descriptor in global scratch memory using global_alloc.
        # 256 bytes holds both descriptors below (each presumably occupies
        # 128 bytes, matching the 128-byte alignment — TODO confirm).
        desc_ptr = tlx.global_alloc(nbytes=256, alignment=128)

        # Create tensor descriptor using the global scratch pointer
        desc_in = tlx.make_tensor_descriptor(
            desc_ptr=desc_ptr,
            base=input_ptr,
            shape=[SIZE],
            strides=[tl.constexpr(1)],
            block_shape=[BLOCK_SIZE],
        )

        # Second descriptor placed 128 bytes past the first, within the same
        # 256-byte scratch allocation.
        desc_out = tlx.make_tensor_descriptor(
            desc_ptr=desc_ptr + 128,
            base=output_ptr,
            shape=[SIZE],
            strides=[tl.constexpr(1)],
            block_shape=[BLOCK_SIZE],
        )

        # Compute tile offset
        pid = tl.program_id(0)
        offset = pid * BLOCK_SIZE

        # Load and store using standard descriptors
        x = desc_in.load([offset])
        desc_out.store([offset], x)

    triton.set_allocator(alloc_fn)
    SIZE = 128
    BLOCK_SIZE = 64
    x = torch.ones((SIZE, ), dtype=torch.int16, device=device)
    y = torch.empty_like(x)
    grid = lambda meta: (triton.cdiv(SIZE, BLOCK_SIZE), )

    compiled_kernel = kernel[grid](x, y, SIZE, BLOCK_SIZE=BLOCK_SIZE)

    # Check that both global_scratch_alloc and tensormap_create were generated in IR.
    # Exactly one scratch alloc (from tlx.global_alloc; the explicit desc_ptr
    # means the lowering should not add its own), and one tensormap_create per
    # descriptor.
    ttgir = compiled_kernel.asm["ttgir"]
    assert ttgir.count("ttg.global_scratch_alloc") == 1, "Expected 1 global_scratch_alloc operation"
    assert ttgir.count("ttng.tensormap_create") == 2, "Expected 2 tensormap_create operations"

    # Verify the data was copied correctly through TMA operations
    torch.testing.assert_close(x, y)

test/Hopper/WarpSpecialization/blackwell_ws_data_partition.mlir

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@ module attributes {ttg.max_reg_auto_ws = 152 : i32, ttg.min_reg_auto_ws = 24 : i
3737
%cst_2 = arith.constant dense<0.127517432> : tensor<256xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
3838
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256x128xf32, #blocked>
3939
// CHECK-COUNT-8: tt.make_tensor_descriptor
40-
%q_desc = tt.make_tensor_descriptor %q, [%c128_i32, %c8192_i32, %c128_i32], [%c1048576_i64, %c128_i64, %c1_i64] : <bf16>, <tensor<1x256x128xbf16, #shared>>
41-
%k_desc = tt.make_tensor_descriptor %k, [%c128_i32, %c8192_i32, %c128_i32], [%c1048576_i64, %c128_i64, %c1_i64] : <bf16>, <tensor<1x128x128xbf16, #shared>>
42-
%v_desc = tt.make_tensor_descriptor %v, [%c128_i32, %c8192_i32, %c128_i32], [%c1048576_i64, %c128_i64, %c1_i64] : <bf16>, <tensor<1x128x128xbf16, #shared>>
43-
%lse_desc_4 = tt.make_tensor_descriptor %lse, [%c128_i32, %c8192_i32], [%lse_desc, %c1_i64] : <f32>, <tensor<1x256xf32, #shared1>>
44-
%o_desc = tt.make_tensor_descriptor %o, [%c128_i32, %c8192_i32, %c128_i32], [%c1048576_i64, %c128_i64, %c1_i64] : <bf16>, <tensor<1x256x128xbf16, #shared>>
40+
%q_desc = tt.make_tensor_descriptor %q, [%c128_i32, %c8192_i32, %c128_i32], [%c1048576_i64, %c128_i64, %c1_i64] : !tt.ptr<bf16>, !tt.tensordesc<tensor<1x256x128xbf16, #shared>>
41+
%k_desc = tt.make_tensor_descriptor %k, [%c128_i32, %c8192_i32, %c128_i32], [%c1048576_i64, %c128_i64, %c1_i64] : !tt.ptr<bf16>, !tt.tensordesc<tensor<1x128x128xbf16, #shared>>
42+
%v_desc = tt.make_tensor_descriptor %v, [%c128_i32, %c8192_i32, %c128_i32], [%c1048576_i64, %c128_i64, %c1_i64] : !tt.ptr<bf16>, !tt.tensordesc<tensor<1x128x128xbf16, #shared>>
43+
%lse_desc_4 = tt.make_tensor_descriptor %lse, [%c128_i32, %c8192_i32], [%lse_desc, %c1_i64] : !tt.ptr<f32>, !tt.tensordesc<tensor<1x256xf32, #shared1>>
44+
%o_desc = tt.make_tensor_descriptor %o, [%c128_i32, %c8192_i32, %c128_i32], [%c1048576_i64, %c128_i64, %c1_i64] : !tt.ptr<bf16>, !tt.tensordesc<tensor<1x256x128xbf16, #shared>>
4545
%0 = tt.get_program_id x : i32
4646
scf.for %virtual_pid = %0 to %total_pids step %c148_i32 : i32 {
4747
%pid_0 = arith.remsi %virtual_pid, %c32_i32 : i32

test/TLX/propagate-layout.mlir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
213213
%0 = tt.get_program_id x : i32
214214
%1 = tt.get_program_id y : i32
215215
%2 = arith.extsi %arg3 : i32 to i64
216-
%3 = tt.make_tensor_descriptor %arg0, [%arg2, %arg3], [%2, %c1_i64] : <i16>, <tensor<64x64xsi16>>
216+
%3 = tt.make_tensor_descriptor %arg0, [%arg2, %arg3], [%2, %c1_i64] : !tt.ptr<i16>, !tt.tensordesc<tensor<64x64xsi16>>
217217
// CHECK: ttg.local_alloc : () -> !ttg.memdesc<1x64x64xi16, #[[$SHARED]], #smem, mutable>
218218
%4 = ttg.local_alloc : () -> !ttg.memdesc<1x64x64xi16, #shared, #smem, mutable>
219219
%5 = ttg.memdesc_index %4[%c0_i32] : !ttg.memdesc<1x64x64xi16, #shared, #smem, mutable> -> !ttg.memdesc<64x64xi16, #shared, #smem, mutable>
@@ -654,8 +654,8 @@ module attributes {tlx.has_explicit_local_mem_access = true, tlx.has_tlx_ops = t
654654
%10 = arith.muli %arg18, %arg15 : i32
655655
%11 = arith.muli %arg16, %c128_i32 : i32
656656
%12 = arith.extsi %11 : i32 to i64
657-
%13 = tt.make_tensor_descriptor %arg2, [%10, %11], [%12, %c1_i64] : <bf16>, <tensor<128x128xbf16>>
658-
%14 = tt.make_tensor_descriptor %arg4, [%10, %11], [%12, %c1_i64] : <bf16>, <tensor<128x128xbf16>>
657+
%13 = tt.make_tensor_descriptor %arg2, [%10, %11], [%12, %c1_i64] : !tt.ptr<bf16>, !tt.tensordesc<tensor<128x128xbf16>>
658+
%14 = tt.make_tensor_descriptor %arg4, [%10, %11], [%12, %c1_i64] : !tt.ptr<bf16>, !tt.tensordesc<tensor<128x128xbf16>>
659659
%15 = ttg.local_alloc : () -> !ttg.memdesc<1x128x128xbf16, #shared, #smem, mutable>
660660
%16 = ttg.local_alloc : () -> !ttg.memdesc<1x128x128xbf16, #shared, #smem, mutable>
661661
%17 = ttg.local_alloc : () -> !ttg.memdesc<3x128x128xbf16, #shared, #smem, mutable>
@@ -784,7 +784,7 @@ module attributes {tlx.has_explicit_local_mem_access = true, tlx.has_tlx_ops = t
784784
%result_4 = ttng.tmem_load %76 : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked2>
785785
%77 = tlx.release_layout %result_4 : tensor<128x128xf32, #blocked2> -> tensor<128x128xf32, #blocked3>
786786
ttng.arrive_barrier %45, 1 : !ttg.memdesc<1xi64, #shared1, #smem, mutable>
787-
%78 = tt.make_tensor_descriptor %arg5, [%58, %11], [%12, %c1_i64] : <bf16>, <tensor<128x128xbf16>>
787+
%78 = tt.make_tensor_descriptor %arg5, [%58, %11], [%12, %c1_i64] : !tt.ptr<bf16>, !tt.tensordesc<tensor<128x128xbf16>>
788788
%79 = arith.truncf %77 : tensor<128x128xf32, #blocked3> to tensor<128x128xbf16, #blocked3>
789789
%80 = arith.addi %56, %71 : i32
790790
%81 = arith.trunci %70 : i64 to i32
@@ -875,7 +875,7 @@ module attributes {tlx.has_explicit_local_mem_access = true, tlx.has_tlx_ops = t
875875
}
876876
%76 = arith.muli %arg21, %c128_i32_13 : i32
877877
%77 = arith.extsi %76 : i32 to i64
878-
%78 = tt.make_tensor_descriptor %arg24, [%63, %76], [%77, %c1_i64_6] : <bf16>, <tensor<128x128xbf16>>
878+
%78 = tt.make_tensor_descriptor %arg24, [%63, %76], [%77, %c1_i64_6] : !tt.ptr<bf16>, !tt.tensordesc<tensor<128x128xbf16>>
879879
%79 = ttg.memdesc_index %arg38[%c0_i32_11] : !ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
880880
%result_14 = ttng.tmem_load %79 : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked2>
881881
%80 = tlx.release_layout %result_14 : tensor<128x128xf32, #blocked2> -> tensor<128x128xf32, #blocked3>
@@ -1071,7 +1071,7 @@ module attributes {tlx.has_explicit_local_mem_access = true, tlx.has_tlx_ops = t
10711071
%75:2 = scf.if %74 -> (i32, i32) {
10721072
%77 = arith.muli %arg21, %c128_i32_7 : i32
10731073
%78 = arith.extsi %77 : i32 to i64
1074-
%79 = tt.make_tensor_descriptor %arg25, [%65, %77], [%78, %c1_i64_6] : <bf16>, <tensor<128x128xbf16>>
1074+
%79 = tt.make_tensor_descriptor %arg25, [%65, %77], [%78, %c1_i64_6] : !tt.ptr<bf16>, !tt.tensordesc<tensor<128x128xbf16>>
10751075
%80 = arith.andi %arg61, %c1_i32_10 : i32
10761076
%81 = ttg.memdesc_index %arg31[%c0_i32_9] : !ttg.memdesc<1xi64, #shared1, #smem, mutable> -> !ttg.memdesc<1xi64, #shared1, #smem, mutable>
10771077
%82 = arith.xori %80, %c1_i32_10 : i32

0 commit comments

Comments
 (0)