Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 190 additions & 10 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
let assemblyFormat = "`<` $value `>`";
}

//===----------------------------------------------------------------------===//
// AMDGPU Type definitions
//===----------------------------------------------------------------------===//

// Base class for type definitions in the AMDGPU dialect. `name` and `traits`
// are forwarded to TypeDef; `typeMnemonic` is used as the type's `mnemonic`.
class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
: TypeDef<AMDGPU_Dialect, name, traits> {
let mnemonic = typeMnemonic;
}

//===----------------------------------------------------------------------===//
// AMDGPU Type definitions
//===----------------------------------------------------------------------===//

def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let summary = "Pair of base addresses that move data between LDS and global storage.";
let description = [{
Expand All @@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let assemblyFormat = "`<` $elementType `>`";
}

// Opaque descriptor type with mnemonic `tdm_descriptor`. It declares no
// parameters and no custom assembly format. It is the result type of
// AMDGPU_MakeDmaDescriptorOp and the operand type of the tensor load/store ops.
def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
let summary = "Descriptors used in tensor store/load operations.";
let description = [{
This type is opaque and corresponds to the two or four descriptor groups
used in tensor_load_to_lds or tensor_store_from_lds.
}];

}

//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1219,17 +1228,15 @@ def AMDGPU_ScaledMFMAOp :
}

def AMDGPU_MakeDmaBaseOp :
AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments, AllElementTypesMatch<["src", "dst"]>]>,
Arguments<(ins
Arg<AnyMemRef, "buffer to read from">:$src,
Variadic<Index>:$srcIndices,
Variadic<Index>:$src_indices,
Arg<AnyMemRef, "buffer to write to">:$dst,
Variadic<Index>:$dstIndices)>,
Variadic<Index>:$dst_indices)>,
Results<(outs AMDGPU_TDMBaseType: $base)> {

// TODO:
// * Add verifiers such that one of the memrefs is from LDS and the other global.
// * Add verifiers to make sure that the type is in the correct direction.
// * Add verifiers to make sure that the number of indices do not exceed the number of dimensions.

let summary = "Pair of based addresses used when moving tiles between LDS and global memory.";
Expand All @@ -1240,11 +1247,184 @@ def AMDGPU_MakeDmaBaseOp :
This operation creates a value corresponding to the tensor descriptor (D#) group 0
found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.

For example:

```mlir
%base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
%descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
```

to

```mlir
// pseudo-code
%global_base = llvm.extractvalue %global_memref[1]
%global_address = llvm.get_element_ptr ...

%lds_base = llvm.extractvalue %lds_memref[1]
%lds_address = llvm.get_element_ptr ...

// Definition of %base
%undef = llvm.mlir.undef : vector<4xi32>
%v0 = llvm.insertelement %15, %undef[0] : vector<4xi32>
%v1 = llvm.insertelement %lds_address, %v0[1] : vector<4xi32>
%v2 = llvm.insertelement %global_address_low, %v1[2] : vector<4xi32>
%base = llvm.insertelement %global_address_high, %v2[3] : vector<4xi32>

rocdl.tensor.load.to.lds %base, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
```

These tensor DMA operations were introduced in gfx1250.
}];

let assemblyFormat = [{
$src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results)
$src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
}];

let hasVerifier = 1;
}

// Builds an opaque TDM descriptor from a base address pair plus mixed
// static/dynamic global sizes, global strides, and shared (tile) sizes.
// Padding, an atomic barrier, and iteration parameters are optional operands.
def AMDGPU_MakeDmaDescriptorOp :
    AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
    Arguments<(ins
      AMDGPU_TDMBaseType: $base,
      Variadic<Index>: $global_dynamic_sizes,
      DenseI64ArrayAttr: $global_static_sizes,
      Variadic<Index>: $global_dynamic_strides,
      DenseI64ArrayAttr: $global_static_strides,
      Variadic<Index>: $shared_dynamic_sizes,
      DenseI64ArrayAttr: $shared_static_sizes,
      Optional<Index>: $pad_amount,
      Optional<Index>: $pad_interval,
      Optional<AnyMemRef>: $atomic_barrier_address,
      Variadic<Index>: $atomic_barrier_indices,
      Optional<Index>: $global_increment,
      Optional<Index>: $lds_increment,
      Optional<Index>: $iteration_count)>,
    Results<(outs AMDGPU_TDMDescriptorType: $desc)> {

  let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
  let description = [{
    Make all descriptor groups needed by tensor memory operations.

    The $base operand corresponds to the base pair addresses, one must be an address in LDS
    while the other must be a global memory location.

    $global_{static/dynamic}_sizes determine the size of the tensor.
    $global_{static/dynamic}_strides determine the strides of the tensor.
    $shared_{static/dynamic}_sizes determine the size of the tile.

    Padding can be applied to the LDS address when copying from memory to LDS,
    but not when copying from LDS to memory.
    The values in the padded target addresses remain the same as before the operation was applied.

    2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
    $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
    $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
    $iteration_count determines how many times to iterate.

    ```mlir
      // Example of moving a two-dimensional tensor to LDS.
      %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
      %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
      amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor

      // Example of moving a two-dimensional tensor to LDS where padding is applied after every integer.
      %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
      %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
      amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
    ```
  }];

  let assemblyFormat = [{
    $base
    `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
    `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
    `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
    ( `padShared` `(` $pad_amount^ `every` $pad_interval`)` )?
    ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
                      `:` type($atomic_barrier_address) `)`)?
    ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
    attr-dict `:` qualified(type($base)) `->` type(results)
  }];

  let extraClassDeclaration = [{
    // Number of entries in the static global size list. getMixedList treats
    // entries flagged by ShapedType::isDynamic as placeholders for dynamic
    // operands, so this counts static and dynamic dimensions alike.
    int getRank() {
      return getGlobalStaticSizes().size();
    }

    // Width, as reported by getWidth(), of the base type's element type.
    // Only FloatType and IntegerType are handled; anything else is
    // unreachable.
    int getElementTypeWidth() {
      Type elementType = getBase().getType().getElementType();
      int width;
      if (auto floatType = dyn_cast<FloatType>(elementType)) {
        width = floatType.getWidth();
      } else if (auto intType = dyn_cast<IntegerType>(elementType)) {
        width = intType.getWidth();
      } else {
        llvm_unreachable("element type must have getWidth interface");
      }
      return width;
    }

    // Merges a static list with its dynamic operands into one list of
    // OpFoldResult. Each dynamic entry in `statics` consumes the next value
    // from `dynamics`, in order; every other entry becomes an index attribute.
    SmallVector<OpFoldResult> getMixedList(SmallVector<Value> dynamics, ArrayRef<int64_t> statics) {
      SmallVector<OpFoldResult> result;
      unsigned ctr = 0;
      OpBuilder b(getContext());
      for (int64_t static_elem : statics) {
        if (ShapedType::isDynamic(static_elem)) {
          result.push_back(dynamics[ctr++]);
        } else {
          result.push_back(b.getIndexAttr(static_elem));
        }
      }
      return result;
    }

    // Global tensor sizes as a mixed static/dynamic list.
    SmallVector<OpFoldResult> getMixedGlobalSizes() {
      return getMixedList(getGlobalDynamicSizes(), getGlobalStaticSizes());
    }

    // Global tensor strides as a mixed static/dynamic list.
    SmallVector<OpFoldResult> getMixedGlobalStrides() {
      return getMixedList(getGlobalDynamicStrides(), getGlobalStaticStrides());
    }

    // Shared (tile) sizes as a mixed static/dynamic list.
    SmallVector<OpFoldResult> getMixedSharedSizes() {
      return getMixedList(getSharedDynamicSizes(), getSharedStaticSizes());
    }
  }];

  let hasVerifier = 1;
  let hasFolder = 1;
}

// Op `tensor_load_to_lds`: takes a single TDM descriptor operand and declares
// no results. It is declared with both MemWrite and MemRead memory effects.
def AMDGPU_TensorLoadToLDSOp :
AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
let summary = "Load tensors from global memory to LDS.";
let description = [{
Load tensors of up to five dimensions from global memory to LDS.

The operation is fully described by the descriptor operand.
}];

// Printed as: <descriptor> <attr-dict> : <fully qualified descriptor type>.
let assemblyFormat = [{
$desc attr-dict `:` qualified(type($desc))
}];
}

// Op `tensor_store_from_lds`: takes a single TDM descriptor operand and
// declares no results. It is declared with both MemWrite and MemRead memory
// effects.
def AMDGPU_TensorStoreFromLDSOp :
AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
let summary = "Store tensors from LDS to global memory.";
let description = [{
Store tensors of up to five dimensions from LDS to global memory.

The operation is fully described by the descriptor operand.
}];

// Printed as: <descriptor> <attr-dict> : <fully qualified descriptor type>.
let assemblyFormat = [{
$desc attr-dict `:` qualified(type($desc))
}];
}

Expand Down
5 changes: 5 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *,
IntegerAttr m, IntegerAttr n, IntegerAttr k) {
printMNKDimensionList(printer, m, n, k);
}

// Utility functions for querying the address space.
bool hasGlobalMemorySpace(Attribute memorySpace);
bool hasWorkgroupMemorySpace(Attribute memorySpace);
bool hasFatRawBufferMemorySpace(Attribute memorySpace);
} // namespace mlir::amdgpu

#define GET_ATTRDEF_CLASSES
Expand Down
Loading