diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index e07c72b839e7c..3581b07dc4e3e 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr traits = []> : TypeDef { let mnemonic = typeMnemonic; } -//===----------------------------------------------------------------------===// -// AMDGPU Type definitions -//===----------------------------------------------------------------------===// - def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> { let summary = "Pair of base addresses that move data between LDS and global storage."; let description = [{ @@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> { let assemblyFormat = "`<` $elementType `>`"; } +def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> { + let summary = "Descriptors used in tensor store/load operations."; + let description = [{ + This type is opaque and corresponds to the two or four descriptor groups + used in tensor_load_to_lds or tensor_store_from_lds. + }]; + +} + //===----------------------------------------------------------------------===// // AMDGPU Op definitions //===----------------------------------------------------------------------===// @@ -1222,14 +1231,13 @@ def AMDGPU_MakeDmaBaseOp : AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>, Arguments<(ins Arg:$src, - Variadic:$srcIndices, + Variadic:$src_indices, Arg:$dst, - Variadic:$dstIndices)>, + Variadic:$dst_indices)>, Results<(outs AMDGPU_TDMBaseType: $base)> { // TODO: // * Add verifiers such that one of the memrefs is from LDS and the other global. - // * Add verifiers to make sure that the type is in the correct direction. // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions. 
let summary = "Pair of base addresses used when moving tiles between LDS and global memory."; @@ -1240,12 +1248,105 @@ def AMDGPU_MakeDmaBaseOp : This operation creates a value corresponding to the tensor descriptor (D#) group 0 found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect. + For example: + + ```mlir + %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + ``` + + to + + ```mlir + // pseudocode + %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)> + %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)> + %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct<(ptr, ptr)> + // type(%base_2) = !llvm.struct<(ptr, ptr)> roughly corresponds to amdgpu.tdm_base + + // The base will be used when constructing dgroup0 + // when lowering amdgpu.make_dma_descriptor + %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)> + %dgroup0_1 = llvm.insertvalue %base2, %dgroup0_0 : .... + + // When lowering amdgpu.tensor_load_to_lds + rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + ``` + These tensor DMA operations were introduced in gfx1250.
}]; let assemblyFormat = [{ - $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results) + $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results) + }]; +} + +def AMDGPU_MakeDmaDescriptorOp : + AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>, + Arguments<(ins + AMDGPU_TDMBaseType: $base, + Variadic: $global_dynamic_sizes, + DenseI64ArrayAttr: $global_static_sizes, + Variadic: $global_dynamic_strides, + DenseI64ArrayAttr: $global_static_strides, + Variadic: $shared_dynamic_sizes, + DenseI64ArrayAttr: $shared_static_sizes, + Optional: $pad, + Optional: $pad_every, + Optional: $atomic_barrier_address, + Variadic: $atomic_barrier_indices, + Optional: $global_increment, + Optional: $lds_increment, + Optional: $iteration_count)>, + Results<(outs AMDGPU_TDMDescriptorType: $desc)> { + + let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS."; + let description = [{ + Make all descriptor groups needed by tensor memory operations. + + The $base operand corresponds to the base pair addresses, one must be an address in LDS + while the other must be a global memory location. + + $global_{static/dynamic}_sizes determine the size of the tensor. + $global_{static/dynamic}_strides determine the strides of the tensor. + $shared_{static/dynamic}_sizes determines the size of the tile. + + Padding can be applied to the LDS address when copying from memory to LDS, + but not when copying from LDS to memory. + The values in the padded target addresses remain the same as before the operation was applied. + + 2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count. + $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type. 
+ $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type. + $iteration_count determines how many times to iterate. + + ```mlir + // Example of moving a two-dimensional tensor to LDS. + %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + + // Example of moving a two-dimensional tensor to LDS where padding is applied after every integer. + %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad every %pad_every) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + ``` + }]; + + let assemblyFormat = [{ + $base + `globalSize` custom($global_dynamic_sizes, $global_static_sizes) + `globalStride` custom($global_dynamic_strides, $global_static_strides) + `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) + ( `padShared` `(` $pad^ `every` $pad_every `)` )? + ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]` + `:` type($atomic_barrier_address) `)`)? + ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
+ attr-dict `:` qualified(type($base)) `->` type(results) }]; + + let hasVerifier = 1; } #endif // AMDGPU diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index cdc10c60a42ae..5ff640b5d1596 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -705,6 +705,34 @@ LogicalResult TransposeLoadOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// MakeDmaDescriptorOp +//===----------------------------------------------------------------------===// + +LogicalResult MakeDmaDescriptorOp::verify() { + ArrayRef globalStaticStrides = getGlobalStaticStrides(); + + if (globalStaticStrides.empty()) { + return emitOpError("strides must not be empty."); + } + if (globalStaticStrides.back() != 1) { + return emitOpError("strides for the innermost dimension must be 1."); + } + + ArrayRef globalStaticSizes = getGlobalStaticSizes(); + size_t rank = globalStaticSizes.size(); + if (rank != globalStaticStrides.size()) { + return emitOpError("strides and sizes must have same rank."); + } + + ArrayRef sharedStaticSizes = getSharedStaticSizes(); + if (rank != sharedStaticSizes.size()) { + return emitOpError("tensor must have same rank as tile."); + } + + return success(); +} + //===----------------------------------------------------------------------===// // ScaledMFMAOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 61fdf29a78cbd..066f46060f62f 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -354,3 +354,43 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x %0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, 
vector<32xf4E2M1FN>, vector<16xf32> func.return %0 : vector<16xf32> } + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_empty_strides +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}} + amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_innermost_stride +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} + amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_size_and_stride_sizes +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}} + amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_shared_and_global_rank +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}} + amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir 
b/mlir/test/Dialect/AMDGPU/ops.mlir index 653f9f64d24f4..a8af06dc5ff0a 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -689,11 +689,62 @@ func.func @memory_counter_wait() { // CHECK-LABEL: func @make_dma_base // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space>) func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space>) { - // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base - amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base - // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base - amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base + amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base func.return } +// CHECK-LABEL: func @make_dma_descriptor +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[BARRIER:.+]]: memref<8xi32>, %[[IDX:.+]]: index) +func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8xi32>, %idx: index) { + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] : 
!amdgpu.tdm_base -> !amdgpu.tdm_descriptor + sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]]) + padShared(%idx every %idx) + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32>) + atomicBarrier(%barrier[%idx] : memref<8xi32>) + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]] + iterate %idx, %idx, %idx + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + func.return +}