Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a4a1a59
[mlir][amdgpu] Add make_dma_base operation
amd-eochoalo Nov 21, 2025
d14f3e2
Remove MemRead and MemWrite from operation
amd-eochoalo Nov 24, 2025
d3ca18c
Add Pure to make_dma_base
amd-eochoalo Nov 24, 2025
76e47f1
Add DynamicIndexList
amd-eochoalo Nov 24, 2025
f6f67e3
Add globalStride
amd-eochoalo Nov 24, 2025
1e2668c
Add verifier for innermost dimension
amd-eochoalo Nov 24, 2025
f1df3c5
Add sharedSize
amd-eochoalo Nov 24, 2025
a24a840
Add optional atomic barrier
amd-eochoalo Nov 24, 2025
ccaf771
Add iterate
amd-eochoalo Nov 24, 2025
566d2e6
[mlir][amdgpu] Add make_dma_descriptor.
amd-eochoalo Nov 24, 2025
2be4ccc
Fix indentation
amd-eochoalo Nov 24, 2025
b3ba450
Review
amd-eochoalo Nov 24, 2025
d34c423
Fix parser
amd-eochoalo Nov 24, 2025
cfb20cc
whitespace
amd-eochoalo Nov 24, 2025
5e98ed0
Fix parser
amd-eochoalo Nov 24, 2025
5cca5f9
check if it is not empty
amd-eochoalo Nov 25, 2025
0f913f5
less variables
amd-eochoalo Nov 25, 2025
adcbc32
mlir example
amd-eochoalo Nov 25, 2025
61fd94d
MLIR examples
amd-eochoalo Nov 25, 2025
3de0f3c
Use custom<DynamicIndexList> for indices.
amd-eochoalo Nov 25, 2025
0a70e24
Update mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
amd-eochoalo Nov 25, 2025
ec58b7c
Merge branch 'main' into eochoa/2025-11-24/amdgpu-make-dma-descriptor
amd-eochoalo Nov 26, 2025
29072b8
Remove OptionalAttr from static indices
amd-eochoalo Nov 26, 2025
50e76d4
Atomic barrier only takes Variadic<Index>
amd-eochoalo Nov 26, 2025
b339c7a
Do not use attribute splitting
amd-eochoalo Nov 26, 2025
fb82ac3
Rename $every to $pad_every.
amd-eochoalo Nov 26, 2025
445f96e
Remove unused functions
amd-eochoalo Nov 26, 2025
e022322
Only use dynamic indices in make_dma_base
amd-eochoalo Nov 26, 2025
850d6d0
Add verification for same rank
amd-eochoalo Nov 26, 2025
a8fbe1a
Verify tile and tensor's rank are the same
amd-eochoalo Nov 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 109 additions & 8 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
let assemblyFormat = "`<` $value `>`";
}

//===----------------------------------------------------------------------===//
// AMDGPU Type definitions
//===----------------------------------------------------------------------===//

// Base class for all AMDGPU dialect types. `typeMnemonic` is the keyword that
// appears after the dialect prefix in the textual IR (i.e. `!amdgpu.<mnemonic>`);
// additional type traits may be supplied via `traits`.
class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
: TypeDef<AMDGPU_Dialect, name, traits> {
let mnemonic = typeMnemonic;
}

//===----------------------------------------------------------------------===//
// AMDGPU Type definitions
//===----------------------------------------------------------------------===//

def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let summary = "Pair of base addresses that move data between LDS and global storage.";
let description = [{
Expand All @@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let assemblyFormat = "`<` $elementType `>`";
}

// Opaque, parameterless handle type for the complete set of TDM descriptor
// groups. Produced by amdgpu.make_dma_descriptor and consumed by the tensor
// load/store operations; its internal layout is defined during lowering.
def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
let summary = "Descriptors used in tensor store/load operations.";
let description = [{
This type is opaque and corresponds to the two or four descriptor groups
used in tensor_load_to_lds or tensor_store_from_lds.
}];

}

//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
def AMDGPU_MakeDmaBaseOp :
    AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
    Arguments<(ins
       Arg<AnyMemRef, "buffer to read from">:$src,
       Variadic<Index>:$src_indices,
       Arg<AnyMemRef, "buffer to write to">:$dst,
       Variadic<Index>:$dst_indices)>,
    Results<(outs AMDGPU_TDMBaseType: $base)> {

  // TODO:
  // * Add verifiers such that one of the memrefs is from LDS and the other global.
  // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions.

  let summary = "Pair of base addresses used when moving tiles between LDS and global memory.";
  let description = [{
    This operation creates a value corresponding to the tensor descriptor (D#) group 0
    found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.

    For example:

    ```mlir
    %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
    %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
    amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
    ```

    lowers, roughly, to:

    ```mlir
    // pseudocode
    %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)>
    %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)>
    %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct<(ptr, ptr)>
    // type(%base_2) = !llvm.struct<(ptr, ptr)> roughly corresponds to amdgpu.tdm_base<i32>

    // The base will be used when constructing dgroup0
    // when lowering amdgpu.make_dma_descriptor.
    %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)>
    %dgroup0_1 = llvm.insertvalue %base2, %dgroup0_0 : ....

    // When lowering amdgpu.tensor_load_to_lds:
    rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
    ```

    These tensor DMA operations were introduced in gfx1250.
  }];

  let assemblyFormat = [{
    $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
  }];
}

def AMDGPU_MakeDmaDescriptorOp :
    AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
    Arguments<(ins
       AMDGPU_TDMBaseType: $base,
       Variadic<Index>: $global_dynamic_sizes,
       DenseI64ArrayAttr: $global_static_sizes,
       Variadic<Index>: $global_dynamic_strides,
       DenseI64ArrayAttr: $global_static_strides,
       Variadic<Index>: $shared_dynamic_sizes,
       DenseI64ArrayAttr: $shared_static_sizes,
       Optional<Index>: $pad,
       Optional<Index>: $pad_every,
       Optional<AnyMemRef>: $atomic_barrier_address,
       Variadic<Index>: $atomic_barrier_indices,
       Optional<Index>: $global_increment,
       Optional<Index>: $lds_increment,
       Optional<Index>: $iteration_count)>,
    Results<(outs AMDGPU_TDMDescriptorType: $desc)> {

  let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
  let description = [{
    Make all descriptor groups needed by tensor memory operations.

    The $base operand corresponds to the pair of base addresses; one must be an
    address in LDS while the other must be a global memory location.

    $global_{static/dynamic}_sizes determine the size of the tensor.
    $global_{static/dynamic}_strides determine the strides of the tensor.
    $shared_{static/dynamic}_sizes determine the size of the tile.

    Padding can be applied to the LDS address when copying from memory to LDS,
    but not when copying from LDS to memory.
    The values in the padded target addresses remain the same as before the
    operation was applied.

    2D and 3D tensors may be iterated over by setting $global_increment,
    $lds_increment, and $iteration_count.
    $global_increment determines how much to increment the starting global memory
    address per iteration in units of the $base's element type.
    $lds_increment determines how much to increment the starting LDS address per
    iteration in units of the $base's element type.
    $iteration_count determines how many times to iterate.

    ```mlir
    // Example of moving a two-dimensional tensor to LDS.
    %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
    %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
    amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor

    // Example of moving a two-dimensional tensor to LDS where padding is
    // applied after every integer.
    %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
    %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad every %pad_every) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
    amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
    ```
  }];

  let assemblyFormat = [{
    $base
    `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
    `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
    `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
    ( `padShared` `(` $pad^ `every` $pad_every `)` )?
    ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
      `:` type($atomic_barrier_address) `)`)?
    ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
    attr-dict `:` qualified(type($base)) `->` type(results)
  }];

  let hasVerifier = 1;
}

#endif // AMDGPU
28 changes: 28 additions & 0 deletions mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,34 @@ LogicalResult TransposeLoadOp::verify() {
return success();
}

//===----------------------------------------------------------------------===//
// MakeDmaDescriptorOp
//===----------------------------------------------------------------------===//

/// Verifies structural invariants of make_dma_descriptor:
/// - the global stride list is non-empty and has a unit innermost stride,
/// - global sizes and strides have the same rank,
/// - the shared (tile) sizes have the same rank as the global (tensor) sizes.
/// Only the static (attribute) portions are checked here; dynamic values are
/// not known until runtime.
LogicalResult MakeDmaDescriptorOp::verify() {
  ArrayRef<int64_t> globalStaticStrides = getGlobalStaticStrides();

  if (globalStaticStrides.empty())
    return emitOpError("strides must not be empty.");
  // The op requires the innermost dimension to be contiguous.
  if (globalStaticStrides.back() != 1)
    return emitOpError("strides for the innermost dimension must be 1.");

  // Sizes and strides describe the same global tensor, so their ranks must
  // agree.
  ArrayRef<int64_t> globalStaticSizes = getGlobalStaticSizes();
  size_t rank = globalStaticSizes.size();
  if (rank != globalStaticStrides.size())
    return emitOpError("strides and sizes must have same rank.");

  // The tile copied into LDS must have the same rank as the global tensor.
  ArrayRef<int64_t> sharedStaticSizes = getSharedStaticSizes();
  if (rank != sharedStaticSizes.size())
    return emitOpError("tensor must have same rank as tile.");

  return success();
}

//===----------------------------------------------------------------------===//
// ScaledMFMAOp
//===----------------------------------------------------------------------===//
Expand Down
40 changes: 40 additions & 0 deletions mlir/test/Dialect/AMDGPU/invalid.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -354,3 +354,43 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x
%0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
func.return %0 : vector<16xf32>
}

// -----

// CHECK-LABEL: func @make_dma_descriptor_invalid_empty_strides
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base<i32>) {
// Verifier must reject an empty globalStride list.
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}}
amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
}

// -----

// CHECK-LABEL: func @make_dma_descriptor_invalid_innermost_stride
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base<i32>) {
// Verifier must reject a non-unit innermost global stride ([1, 2] ends in 2).
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}}
amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
}

// -----

// CHECK-LABEL: func @make_dma_descriptor_invalid_size_and_stride_sizes
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base<i32>) {
// Verifier must reject rank-1 globalSize paired with rank-2 globalStride.
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}}
amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
}

// -----

// CHECK-LABEL: func @make_dma_descriptor_invalid_shared_and_global_rank
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base<i32>) {
// Verifier must reject a rank-1 tile (sharedSize) against a rank-2 tensor.
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}}
amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
}
59 changes: 55 additions & 4 deletions mlir/test/Dialect/AMDGPU/ops.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -689,11 +689,62 @@ func.func @memory_counter_wait() {
// CHECK-LABEL: func @make_dma_base
// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space<workgroup>>)
func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space<workgroup>>) {
// Round-trip: global memory as source, workgroup (LDS) memory as destination.
// CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>

// Round-trip the opposite direction: LDS as source, global memory as destination.
// CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> -> !amdgpu.tdm_base<i32>
amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> -> !amdgpu.tdm_base<i32>
func.return
}

// CHECK-LABEL: func @make_dma_descriptor
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: memref<8xi32>, %[[IDX:.+]]: index)
func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32>, %idx: index) {

// Minimal form: only the mandatory globalSize/globalStride/sharedSize clauses.
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
// CHECK-SAME: globalSize [0]
globalSize [0]
// CHECK-SAME: globalStride [1]
globalStride [1]
// CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor

// With the optional padShared clause (pad amount and pad interval).
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
// CHECK-SAME: globalSize [0]
globalSize [0]
// CHECK-SAME: globalStride [1]
globalStride [1]
// CHECK-SAME: sharedSize [0]
sharedSize [0]
// CHECK-SAME: padShared(%[[IDX]] every %[[IDX]])
padShared(%idx every %idx)
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor

// With the optional atomicBarrier clause (barrier memref plus indices).
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
// CHECK-SAME: globalSize [0]
globalSize [0]
// CHECK-SAME: globalStride [1]
globalStride [1]
// CHECK-SAME: sharedSize [0]
sharedSize [0]
// CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32>)
atomicBarrier(%barrier[%idx] : memref<8xi32>)
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor

// With the optional iterate clause (global increment, LDS increment, count).
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
// CHECK-SAME: globalSize [0]
globalSize [0]
// CHECK-SAME: globalStride [1]
globalStride [1]
// CHECK-SAME: sharedSize [0]
sharedSize [0]
// CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]]
iterate %idx, %idx, %idx
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor

func.return
}