[TLX] Enable TMA desc creation pipelining (#715)

htyu · meta-codesync[bot] · commit 5d1c25f57a32 · 2025-12-03T20:55:56.000-08:00
Summary: This diff adds the `reinterpret_tensor_descriptor` API to TLX for converting raw pointers to TMA tensor descriptors, and fixes the conditional purity modeling of `TT_MakeTensorDescOp`. 1. **New `reinterpret_tensor_descriptor` API** (`mem_ops.py`, `__init__.py`): - Takes a `desc_ptr` (pointer to global memory containing a TMA descriptor), `block_shape`, and `dtype` - Returns a `tl.tensor_descriptor_base` that can be used with TMA operations - Useful when working with pre-allocated descriptor pointers from `tlx.global_alloc` 2. **Fixed conditional purity for `TT_MakeTensorDescOp`** (`TritonOps.td`, `Ops.cpp`): - Removed the `Pure` trait (which cannot be conditional in MLIR ODS) - Implemented `MemoryEffectOpInterface::getEffects()` to conditionally add memory effects - When `descPtr` operand is present, adds `MemoryEffects::Write` to global memory - When `descPtr` is absent, no effects are added (operation is effectively pure) 3. **Added unit test** (`test_tlx.py`): - Tests the new `reinterpret_tensor_descriptor` API with TMA operations - Validates that both `ttg.global_scratch_alloc` and `ttng.reinterpret_tensor_descriptor` operations are generated - Verifies data correctness through TMA load/store operations The `reinterpret_tensor_descriptor` API enables tensor descriptor pipelining patterns where descriptors are allocated in global scratch memory and reused across kernel invocations. This is critical for performance optimization on Hopper/Blackwell GPUs. The conditional purity fix ensures that the MLIR compiler correctly models side effects: when a descriptor pointer is provided, the operation writes to global memory (impure); when auto-allocated, it has no observable side effects (pure). This follows the proper MLIR idiom since ODS traits are compile-time only and cannot be toggled at runtime. Pull Request resolved: #715 Test Plan: Existing tests pass: - `test_reinterpret_tensor_descriptor` validates the new API with TMA operations - Verifies correct MLIR operation generation (global_scratch_alloc, reinterpret_tensor_descriptor) - Confirms data correctness through TMA load/store round-trip The conditional memory effect modeling is validated by the MLIR compiler infrastructure which uses `getEffects()` during optimization passes. Performance: For groupedGEMM with memory-bound shapes such as G=16, M=8192, N=512, K=256 ``` before: x_val preprocessed_aten_grouped_mm-tflops tlx_grouped_gemm-tflops ------- ------------------------------------- ------------------------- 8192 283.609 243.148 after: x_val preprocessed_aten_grouped_mm-tflops tlx_grouped_gemm-tflops ------- ------------------------------------- ------------------------- 8192 283.459 274.755 ``` Reviewed By: manman-ren Differential Revision: D88292292 Pulled By: htyu fbshipit-source-id: ad1a087071166a12670b42511b2f5ffeea220610
diff --git a/README.md b/README.md
@@ -69,12 +69,33 @@ While this approach places more responsibility on the user, it reduces the compi
 
    Store a chunk of data from local memory into global memory buffer. The global address, strides, and buffer size are defined by the memory descriptor.
 
+- `desc_ptrs = tlx.allocate_tensor_descriptor(num)`
+
+   Allocates global memory for tensor descriptor storage with built-in parameters (nbytes=128, alignment=128 per descriptor).
+   Returns a `tensor_descriptor_ptr` with 128-byte stride semantics that supports indexing.
+
+   **Parameters:**
+   - `num`: Number of tensor descriptors to allocate (must be a constexpr)
+
+   **Returns:**
+   - A `tensor_descriptor_ptr` where indexing (e.g., `desc_ptrs[0]`, `desc_ptrs[1]`) advances by 128 bytes per index
+
+   **Example:**
+   ```python
+   # Allocate storage for 4 tensor descriptors
+   desc_ptrs = tlx.allocate_tensor_descriptor(num=4)
+
+   # Access individual descriptors using indexing
+   desc_ptr_0 = desc_ptrs[0]  # First descriptor
+   desc_ptr_1 = desc_ptrs[1]  # Second descriptor (128 bytes offset)
+   ```
+
 - `tlx.make_tensor_descriptor(desc_ptr, base, shape, strides, block_shape, padding_option)`
 
    Create a TMA (Tensor Memory Accelerator) descriptor for efficient asynchronous data movement on Hopper and Blackwell GPUs.
 
    **Parameters:**
-   - `desc_ptr` (optional): Pointer to global memory for descriptor storage. Pass `None` for automatic allocation.
+   - `desc_ptr` (optional): Tensor descriptor pointer from `allocate_tensor_descriptor()`. Pass `None` for automatic allocation.
    - `base`: Base pointer to the tensor in global memory
    - `shape`: List of tensor dimensions (dynamic, runtime values)
    - `strides`: List of tensor strides (dynamic, runtime values)
@@ -92,15 +113,47 @@ While this approach places more responsibility on the user, it reduces the compi
        block_shape=[64, 64],
    )
 
-   # Or with explicit scratch allocation for advanced use cases
-   desc_ptr = tlx.global_alloc(nbytes=128, alignment=128)
-   desc = tlx.make_tensor_descriptor(
-       desc_ptr=desc_ptr,
+   # Or with explicit descriptor allocation for advanced use cases (e.g., pipelining)
+   desc_ptrs = tlx.allocate_tensor_descriptor(num=2)
+
+   # Create descriptor at index 0
+   tlx.make_tensor_descriptor(
+       desc_ptr=desc_ptrs[0],
        base=tensor_ptr,
        shape=[M, N],
        strides=[N, tl.constexpr(1)],
        block_shape=[64, 64],
    )
+
+   # Reinterpret the descriptor for TMA operations
+   desc = tlx.reinterpret_tensor_descriptor(
+       desc_ptr=desc_ptrs[0],
+       block_shape=[64, 64],
+       dtype=tl.float16,
+   )
+
+   # Use with async TMA operations
+   tlx.async_descriptor_load(desc, buffer, offsets=[m_offset, n_offset], barrier=mbar)
+   ```
+
+- `desc = tlx.reinterpret_tensor_descriptor(desc_ptr, block_shape, dtype)`
+
+   Reinterpret a tensor descriptor pointer as a TMA-backed tensor descriptor object.
+
+   **Parameters:**
+   - `desc_ptr`: A `tensor_descriptor_ptr` pointing to the TMA descriptor (from `allocate_tensor_descriptor`)
+   - `block_shape`: Shape of the block to be loaded/stored (compile-time constants)
+   - `dtype`: Data type of the tensor elements
+
+   **Example:**
+   ```python
+   # Allocate and create descriptor
+   desc_ptrs = tlx.allocate_tensor_descriptor(num=2)
+   tlx.make_tensor_descriptor(desc_ptr=desc_ptrs[0], base=a_ptr, shape=[M, K], strides=[K, 1], block_shape=[128, 64])
+
+   # Reinterpret for use with TMA
+   a_desc = tlx.reinterpret_tensor_descriptor(desc_ptr=desc_ptrs[0], block_shape=[128, 64], dtype=tl.float16)
+   tlx.async_descriptor_load(a_desc, buffer, offsets=[offs_m, offs_k], barrier=mbar)
    ```
 
 - `tlx.async_load(tensor_ptr, buffer, optional_mask, optional_other, cache_modifier, eviction_policy, is_volatile)`
diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td
@@ -1053,8 +1053,8 @@ def TT_MakeTensorPtrOp : TT_Op<"make_tensor_ptr",
 // Make Tensor Descriptor Op
 //
 def TT_MakeTensorDescOp : TT_Op<"make_tensor_descriptor", [
-    Pure,
     AttrSizedOperandSegments,
+    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
 ]> {
   let summary = "Make a tensor descriptor type with meta information of the parent tensor and block size";
 
diff --git a/lib/Dialect/Triton/IR/Ops.cpp b/lib/Dialect/Triton/IR/Ops.cpp
@@ -1288,6 +1288,16 @@ void MakeTensorDescOp::print(OpAsmPrinter &p) {
   p << " : " << getBase().getType() << ", " << getType();
 }
 
+void MakeTensorDescOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  // If descPtr operand is present, this operation writes to global memory
+  if (getDescPtr()) {
+    effects.emplace_back(MemoryEffects::Write::get(), GlobalMemory::get());
+  }
+  // Otherwise, the operation is pure (no effects)
+}
+
 // The following ops, including `call`, `func`, and `return` are copied and
 // modified from
 // https://github.com/llvm/llvm-project/blob/main/mlir/lib/Dialect/Func/IR/FuncOps.cpp
diff --git a/python/test/unit/language/test_tlx.py b/python/test/unit/language/test_tlx.py
@@ -2634,7 +2634,7 @@ def stoch_round_seed_kernel(x_ptr, y_ptr, seed, BLOCK_SIZE: tl.constexpr):
 
 @pytest.mark.skipif(not is_hopper_or_newer(), reason="Need Hopper or newer")
 def test_make_tensor_descriptor(device):
-    """Test global_alloc and make_tensor_descriptor together with TMA operations."""
+    """Test allocate_tensor_descriptor and make_tensor_descriptor together with TMA operations."""
 
     def alloc_fn(size: int, align: int, stream: Optional[int]):
         assert align == 128
@@ -2643,20 +2643,20 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
 
     @triton.jit
     def kernel(input_ptr, output_ptr, SIZE, BLOCK_SIZE: tl.constexpr):
-        # Allocate descriptor in global scratch memory using global_alloc
-        desc_ptr = tlx.global_alloc(nbytes=256, alignment=128)
+        # Allocate descriptor in global scratch memory using allocate_tensor_descriptor
+        desc_ptrs = tlx.allocate_tensor_descriptor(num=2)
 
         # Create tensor descriptor using the global scratch pointer
-        desc_in = tlx.make_tensor_descriptor(
-            desc_ptr=desc_ptr,
+        tlx.make_tensor_descriptor(
+            desc_ptr=desc_ptrs[0],
             base=input_ptr,
             shape=[SIZE],
             strides=[tl.constexpr(1)],
             block_shape=[BLOCK_SIZE],
         )
 
-        desc_out = tlx.make_tensor_descriptor(
-            desc_ptr=desc_ptr + 128,
+        tlx.make_tensor_descriptor(
+            desc_ptr=desc_ptrs[1],
             base=output_ptr,
             shape=[SIZE],
             strides=[tl.constexpr(1)],
@@ -2668,6 +2668,17 @@ def kernel(input_ptr, output_ptr, SIZE, BLOCK_SIZE: tl.constexpr):
         offset = pid * BLOCK_SIZE
 
         # Load and store using standard descriptors
+        # Reinterpret pointers as tensor descriptors
+        desc_in = tlx.reinterpret_tensor_descriptor(
+            desc_ptr=desc_ptrs[0],
+            block_shape=[BLOCK_SIZE],
+            dtype=tlx.dtype_of(input_ptr),
+        )
+        desc_out = tlx.reinterpret_tensor_descriptor(
+            desc_ptr=desc_ptrs[1],
+            block_shape=[BLOCK_SIZE],
+            dtype=tlx.dtype_of(output_ptr),
+        )
         x = desc_in.load([offset])
         desc_out.store([offset], x)
 
@@ -2684,6 +2695,7 @@ def kernel(input_ptr, output_ptr, SIZE, BLOCK_SIZE: tl.constexpr):
     ttgir = compiled_kernel.asm["ttgir"]
     assert ttgir.count("ttg.global_scratch_alloc") == 1, "Expected 1 global_scratch_alloc operation"
     assert ttgir.count("ttng.tensormap_create") == 2, "Expected 2 tensormap_create operations"
+    assert ttgir.count("ttng.reinterpret_tensor_descriptor") == 2, "Expected 2 reinterpret_tensor_descriptor operations"
 
     # Verify the data was copied correctly through TMA operations
     torch.testing.assert_close(x, y)
diff --git a/test/TritonNvidiaGPU/tma_lowering.mlir b/test/TritonNvidiaGPU/tma_lowering.mlir
@@ -103,8 +103,60 @@ tt.func @tma_scatter(%arg0: !tt.tensordesc<tensor<1x128xbf16, #nvmma_128>>, %arg
   // CHECK-NEXT: ttng.async_tma_store_wait
   tt.descriptor_scatter %arg0[%arg1, %arg2], %arg3 : !tt.tensordesc<tensor<1x128xbf16, #nvmma_128>>, tensor<32xi32, #blocked>, i32, tensor<32x128xbf16, #blocked1>
   tt.return
+  }
+
 }
 
+// -----
+
+#nvmma_32 = #ttg.nvmma_shared<{swizzlingByteWidth = 32, transposed = false, elementBitWidth = 8}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  // Test that MakeTensorDescOp without descPtr has no memory effects (pure)
+  // This enables CSE - duplicate operations with identical inputs can be eliminated
+  // CHECK-LABEL: make_tensor_descriptor_pure
+  tt.func public @make_tensor_descriptor_pure(%arg0: !tt.ptr<i8> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}) -> !tt.tensordesc<tensor<8x32xi8, #nvmma_32>> {
+    %c1_i64 = arith.constant 1 : i64
+    %0 = arith.extsi %arg2 : i32 to i64
+    // Without descPtr, the operation has no observable side effects
+    // Both calls have identical inputs, so CSE should eliminate one
+    %1 = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : !tt.ptr<i8>, !tt.tensordesc<tensor<8x32xi8, #nvmma_32>>
+    %2 = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : !tt.ptr<i8>, !tt.tensordesc<tensor<8x32xi8, #nvmma_32>>
+    // CHECK: %[[ALLOC:.*]] = ttg.global_scratch_alloc
+    // CHECK: ttng.tensormap_create %[[ALLOC]]
+    // CHECK: ttng.tensormap_fenceproxy_acquire %[[ALLOC]]
+    // CHECK: %[[DESC:.*]] = ttng.reinterpret_tensor_descriptor %[[ALLOC]]
+    // CHECK-NOT: ttg.global_scratch_alloc
+    // CHECK-NOT: ttng.tensormap_create
+    // Both operations should be CSE'd into a single descriptor due to purity
+    tt.return %1 : !tt.tensordesc<tensor<8x32xi8, #nvmma_32>>
+  }
+}
+
+// -----
+
+#nvmma_32 = #ttg.nvmma_shared<{swizzlingByteWidth = 32, transposed = false, elementBitWidth = 8}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  // Test that MakeTensorDescOp with descPtr has memory effects (impure)
+  // This prevents CSE - operations writing to different locations must be preserved
+  // CHECK-LABEL: make_tensor_descriptor_impure
+  tt.func public @make_tensor_descriptor_impure(%arg0: !tt.ptr<i8> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<i8> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<i8> {tt.divisibility = 16 : i32}) -> (!tt.tensordesc<tensor<8x32xi8, #nvmma_32>>, !tt.tensordesc<tensor<8x32xi8, #nvmma_32>>) {
+    %c1_i64 = arith.constant 1 : i64
+    %0 = arith.extsi %arg2 : i32 to i64
+    // With descPtr, the operation writes to global memory (impure)
+    // Both operations write to different locations (arg3 vs arg4), so both must be preserved
+    %1 = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64], descPtr = %arg3 : !tt.ptr<i8> : !tt.ptr<i8>, !tt.tensordesc<tensor<8x32xi8, #nvmma_32>>
+    %2 = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64], descPtr = %arg4 : !tt.ptr<i8> : !tt.ptr<i8>, !tt.tensordesc<tensor<8x32xi8, #nvmma_32>>
+    // CHECK: ttng.tensormap_create %arg3
+    // CHECK: ttng.tensormap_fenceproxy_acquire %arg3
+    // CHECK: %[[DESC1:.*]] = ttng.reinterpret_tensor_descriptor %arg3
+    // CHECK: ttng.tensormap_create %arg4
+    // CHECK: ttng.tensormap_fenceproxy_acquire %arg4
+    // CHECK: %[[DESC2:.*]] = ttng.reinterpret_tensor_descriptor %arg4
+    // Both operations must be preserved (no CSE) due to impurity
+    tt.return %1, %2 : !tt.tensordesc<tensor<8x32xi8, #nvmma_32>>, !tt.tensordesc<tensor<8x32xi8, #nvmma_32>>
+  }
 }
 
 // -----
diff --git a/third_party/tlx/language/tlx/__init__.py b/third_party/tlx/language/tlx/__init__.py
@@ -14,6 +14,8 @@
     clc_response_type,
     CLCPipelineContext,
     async_token,
+    tensor_descriptor_ptr,
+    tensor_descriptor_ptr_type,
 )
 from .mem_ops import (
     local_alloc,
@@ -28,12 +30,13 @@
     local_store,
     local_trans,
     local_reinterpret,
-    global_alloc,
+    allocate_tensor_descriptor,
     async_descriptor_load,
     async_descriptor_store,
     async_descriptor_store_wait,
     fence_async_shared,
     make_tensor_descriptor,
+    reinterpret_tensor_descriptor,
 )
 from .barrier import (
     alloc_barriers,
@@ -88,6 +91,8 @@
     "clc_response_type",
     "CLCPipeliner",
     "async_token",
+    "tensor_descriptor_ptr",
+    "tensor_descriptor_ptr_type",
     # mem_ops
     "local_alloc",
     "local_view",
@@ -101,12 +106,13 @@
     "local_store",
     "local_trans",
     "local_reinterpret",
-    "global_alloc",
+    "allocate_tensor_descriptor",
     "async_descriptor_load",
     "async_descriptor_store",
     "async_descriptor_store_wait",
     "fence_async_shared",
     "make_tensor_descriptor",
+    "reinterpret_tensor_descriptor",
     # barriers
     "alloc_barriers",
     "barrier_expect_bytes",
diff --git a/third_party/tlx/language/tlx/mem_ops.py b/third_party/tlx/language/tlx/mem_ops.py
diff --git a/third_party/tlx/language/tlx/types.py b/third_party/tlx/language/tlx/types.py
diff --git a/third_party/tlx/tutorials/blackwell-grouped-gemm_test.py b/third_party/tlx/tutorials/blackwell-grouped-gemm_test.py