From b766215221d453a97002d6faabc1c387dac3f10b Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 16:42:44 -0500 Subject: [PATCH 1/9] [mlir][amdgpu] Add make_dma_descriptor op --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 117 ++++++++++++++++-- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 28 +++++ mlir/test/Dialect/AMDGPU/invalid.mlir | 40 ++++++ mlir/test/Dialect/AMDGPU/ops.mlir | 59 ++++++++- 4 files changed, 232 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index e07c72b839e7c..3581b07dc4e3e 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr traits = []> : TypeDef { let mnemonic = typeMnemonic; } -//===----------------------------------------------------------------------===// -// AMDGPU Type definitions -//===----------------------------------------------------------------------===// - def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> { let summary = "Pair of base addresses that move data between LDS and global storage."; let description = [{ @@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> { let assemblyFormat = "`<` $elementType `>`"; } +def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> { + let summary = "Descriptors used in tensor store/load operations."; + let description = [{ + This type is opaque and corresponds to the two or four descriptor groups + used in tensor_load_to_lds or tensor_store_from_lds. 
+ }]; + +} + //===----------------------------------------------------------------------===// // AMDGPU Op definitions //===----------------------------------------------------------------------===// @@ -1222,14 +1231,13 @@ def AMDGPU_MakeDmaBaseOp : AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>, Arguments<(ins Arg:$src, - Variadic:$srcIndices, + Variadic:$src_indices, Arg:$dst, - Variadic:$dstIndices)>, + Variadic:$dst_indices)>, Results<(outs AMDGPU_TDMBaseType: $base)> { // TODO: // * Add verifiers such that one of the memrefs is from LDS and the other global. - // * Add verifiers to make sure that the type is in the correct direction. // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions. let summary = "Pair of based addresses used when moving tiles between LDS and global memory."; @@ -1240,12 +1248,105 @@ def AMDGPU_MakeDmaBaseOp : This operation creates a value corresponding to the tensor descriptor (D#) group 0 found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect. + For example: + + ```mlir + %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + ``` + + to + + ```mlir + // pseudocode + %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)> + %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)> + %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct(ptr, ptr)> + // type(%base_2) = !llvm.struct<(ptr, ptr) roughly corresponds to amdgpu.tdm_base + + // The base will be used when contructing dgroup0 + // when lowering amdgpu.make_dma_descriptor + %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)> + %dgroup0_1 = llvm.insertvalue %base2, %dgroup0_0 : .... 
+ + // When lowering amdgpu.tensor_load_to_lds + rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + ``` + These tensor DMA operations were introduced in gfx1250. }]; let assemblyFormat = [{ - $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results) + $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results) + }]; +} + +def AMDGPU_MakeDmaDescriptorOp : + AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>, + Arguments<(ins + AMDGPU_TDMBaseType: $base, + Variadic: $global_dynamic_sizes, + DenseI64ArrayAttr: $global_static_sizes, + Variadic: $global_dynamic_strides, + DenseI64ArrayAttr: $global_static_strides, + Variadic: $shared_dynamic_sizes, + DenseI64ArrayAttr: $shared_static_sizes, + Optional: $pad, + Optional: $pad_every, + Optional: $atomic_barrier_address, + Variadic: $atomic_barrier_indices, + Optional: $global_increment, + Optional: $lds_increment, + Optional: $iteration_count)>, + Results<(outs AMDGPU_TDMDescriptorType: $desc)> { + + let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS."; + let description = [{ + Make all descriptor groups needed by tensor memory operations. + + The $base operand corresponds to the base pair addresses, one must be an address in LDS + while the other must be a global memory location. + + $global_{static/dynamic}_sizes determine the size of the tensor. + $global_{static/dynamic}_strides determine the strides of the tensor. + $shared_{static/dynamic}_sizes determines the size of the tile. + + Padding can be applied to the LDS address when copying from memory to LDS, + but not when copying from LDS to memory. + The values in the padded target addresses remain the same as before the operation was applied. 
+ + 2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count. + $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type. + $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type. + $iteration_count determines how many times to iterate. + + ```mlir + // Example of moving a two-dimensional tensor to LDS. + %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + + // Example of moving a two-dimensional tensor to LDS where padding is applied after every integer. + %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad every %pad_every) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + ``` + }]; + + let assemblyFormat = [{ + $base + `globalSize` custom($global_dynamic_sizes, $global_static_sizes) + `globalStride` custom($global_dynamic_strides, $global_static_strides) + `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) + ( `padShared` `(` $pad^ `every` $pad_every `)` )? + ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]` + `:` type($atomic_barrier_address) `)`)? + ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )? 
+ attr-dict `:` qualified(type($base)) `->` type(results) }]; + + let hasVerifier = 1; } #endif // AMDGPU diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index cdc10c60a42ae..5ff640b5d1596 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -705,6 +705,34 @@ LogicalResult TransposeLoadOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// MakeDmaDescriptorOp +//===----------------------------------------------------------------------===// + +LogicalResult MakeDmaDescriptorOp::verify() { + ArrayRef globalStaticStrides = getGlobalStaticStrides(); + + if (globalStaticStrides.empty()) { + return emitOpError("strides must not be empty."); + } + if (globalStaticStrides.back() != 1) { + return emitOpError("strides for the innermost dimension must be 1."); + } + + ArrayRef globalStaticSizes = getGlobalStaticSizes(); + size_t rank = globalStaticSizes.size(); + if (rank != globalStaticStrides.size()) { + return emitOpError("strides and sizes must have same rank."); + } + + ArrayRef sharedStaticSizes = getSharedStaticSizes(); + if (rank != sharedStaticSizes.size()) { + return emitOpError("tensor must have same rank as tile."); + } + + return success(); +} + //===----------------------------------------------------------------------===// // ScaledMFMAOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 61fdf29a78cbd..066f46060f62f 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -354,3 +354,43 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x %0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, 
vector<32xf4E2M1FN>, vector<16xf32> func.return %0 : vector<16xf32> } + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_empty_strides +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}} + amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_innermost_stride +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} + amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_size_and_stride_sizes +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}} + amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_shared_and_global_rank +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}} + amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir 
b/mlir/test/Dialect/AMDGPU/ops.mlir index 653f9f64d24f4..a8af06dc5ff0a 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -689,11 +689,62 @@ func.func @memory_counter_wait() { // CHECK-LABEL: func @make_dma_base // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space>) func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space>) { - // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base - amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base - // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base - amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base + amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base func.return } +// CHECK-LABEL: func @make_dma_descriptor +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[BARRIER:.+]]: memref<8xi32>, %[[IDX:.+]]: index) +func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8xi32>, %idx: index) { + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] : 
!amdgpu.tdm_base -> !amdgpu.tdm_descriptor + sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]]) + padShared(%idx every %idx) + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32>) + atomicBarrier(%barrier[%idx] : memref<8xi32>) + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]] + iterate %idx, %idx, %idx + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + func.return +} From 3c31d68dab4254b01b727bee1bd15e4a2c6fc15e Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 25 Nov 2025 11:28:04 -0500 Subject: [PATCH 2/9] [mlir][amdgpu] Add tensor load store operation --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 30 +++++++++++++++++++ mlir/test/Dialect/AMDGPU/ops.mlir | 10 +++++++ 2 files changed, 40 insertions(+) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 3581b07dc4e3e..12ef5337296a2 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1349,4 +1349,34 @@ def AMDGPU_MakeDmaDescriptorOp : let hasVerifier = 1; } +def AMDGPU_TensorLoadToLDSOp : + 
AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>, + Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> { + let summary = "Load tensors from global memory to LDS."; + let description = [{ + Load tensors of up to five dimensions from global memory to LDS. + + The operation is fully described by the descriptor operand. + }]; + + let assemblyFormat = [{ + $desc attr-dict `:` qualified(type($desc)) + }]; +} + +def AMDGPU_TensorStoreFromLDSOp : + AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>, + Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> { + let summary = "Store tensors from LDS to global memory."; + let description = [{ + Store tensors of up to five dimensions from LDS to global memory. + + The operation is fully described by the descriptor operand. + }]; + + let assemblyFormat = [{ + $desc attr-dict `:` qualified(type($desc)) + }]; +} + #endif // AMDGPU diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index a8af06dc5ff0a..aa6bedc0e1135 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -748,3 +748,13 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x func.return } + +// CHECK-LABEL: @tensor_load_store +// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor) +func.func @tensor_load_store(%desc: !amdgpu.tdm_descriptor) { + // CHECK: amdgpu.tensor_load_to_lds %[[DESC]] + amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor + // CHECK: amdgpu.tensor_store_from_lds %[[DESC]] + amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor + return +} From cb116ea0b0444eb72c26e38bdb6572cdeef97e61 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 25 Nov 2025 14:24:50 -0500 Subject: [PATCH 3/9] [mlir][amdgpu] Lower amdgpu.make_dma_base. 
--- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 3 +- .../mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h | 5 ++ .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 77 +++++++++++++++++- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 79 ++++++++++++------- ...cvt_scale_pk-gfx1250.mlir => gfx1250.mlir} | 74 +++++++++++++++++ 5 files changed, 206 insertions(+), 32 deletions(-) rename mlir/test/Conversion/AMDGPUToROCDL/{cvt_scale_pk-gfx1250.mlir => gfx1250.mlir} (73%) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 12ef5337296a2..9cb0752fba48b 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1237,7 +1237,6 @@ def AMDGPU_MakeDmaBaseOp : Results<(outs AMDGPU_TDMBaseType: $base)> { // TODO: - // * Add verifiers such that one of the memrefs is from LDS and the other global. // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions. let summary = "Pair of based addresses used when moving tiles between LDS and global memory."; @@ -1280,6 +1279,8 @@ def AMDGPU_MakeDmaBaseOp : let assemblyFormat = [{ $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results) }]; + + let hasVerifier = 1; } def AMDGPU_MakeDmaDescriptorOp : diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h index a7680fb5c3191..958757da0933e 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h @@ -48,6 +48,11 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *, IntegerAttr m, IntegerAttr n, IntegerAttr k) { printMNKDimensionList(printer, m, n, k); } + +// Utility functions for quering the address space. 
+bool hasGlobalMemorySpace(Attribute memorySpace); +bool hasWorkgroupMemorySpace(Attribute memorySpace); +bool hasFatRawBufferMemorySpace(Attribute memorySpace); } // namespace mlir::amdgpu #define GET_ATTRDEF_CLASSES diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index b9a5e7d7f6eac..3316e16a05d5c 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2264,6 +2264,76 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern { } }; +struct AMDGPUMakeDmaBaseLowering + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + AMDGPUMakeDmaBaseLowering(const LLVMTypeConverter &converter, Chipset chipset) + : ConvertOpToLLVMPattern(converter), chipset(chipset) {} + Chipset chipset; + + LogicalResult + matchAndRewrite(MakeDmaBaseOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (chipset < kGfx1250) + return op->emitOpError("make_dma_base is only supported on gfx1250"); + + Location loc = op.getLoc(); + + ValueRange srcIndices = adaptor.getSrcIndices(); + Value src = adaptor.getSrc(); + auto srcMemRefType = cast(op.getSrc().getType()); + + Value srcPtr = + getStridedElementPtr(rewriter, loc, srcMemRefType, src, srcIndices); + + ValueRange dstIndices = adaptor.getDstIndices(); + Value dst = adaptor.getDst(); + auto dstMemRefType = cast(op.getDst().getType()); + + Value dstPtr = + getStridedElementPtr(rewriter, loc, dstMemRefType, dst, dstIndices); + + bool storeFrom = hasWorkgroupMemorySpace(srcMemRefType.getMemorySpace()); + Value ldsAddr = storeFrom ? srcPtr : dstPtr; + Value globalAddr = storeFrom ? 
dstPtr : srcPtr; + + Type i32 = rewriter.getI32Type(); + Type i64 = rewriter.getI64Type(); + + Value castForLdsAddr = + LLVM::PtrToIntOp::create(rewriter, loc, i32, ldsAddr); + Value castForGlobalAddr = + LLVM::PtrToIntOp::create(rewriter, loc, i64, globalAddr); + + Value mask = createI64Constant(rewriter, loc, 0x1FFFFFFFFFFFFFF); + Value first57BitsOfGlobalAddr = + LLVM::AndOp::create(rewriter, loc, castForGlobalAddr, mask); + Value shift = LLVM::LShrOp::create(rewriter, loc, first57BitsOfGlobalAddr, + createI64Constant(rewriter, loc, 32)); + + Value lowHalf = + LLVM::TruncOp::create(rewriter, loc, i32, first57BitsOfGlobalAddr); + Value highHalf = LLVM::TruncOp::create(rewriter, loc, i32, shift); + + Value c0 = createI32Constant(rewriter, loc, 0); + Value c1 = createI32Constant(rewriter, loc, 1); + Value c2 = createI32Constant(rewriter, loc, 2); + Value c3 = createI32Constant(rewriter, loc, 3); + + Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32)); + Value result = LLVM::UndefOp::create(rewriter, loc, v4i32); + result = LLVM::InsertElementOp::create(rewriter, loc, result, c0, c0); + result = LLVM::InsertElementOp::create(rewriter, loc, result, + castForLdsAddr, c1); + result = LLVM::InsertElementOp::create(rewriter, loc, result, lowHalf, c2); + result = LLVM::InsertElementOp::create(rewriter, loc, result, highHalf, c3); + + rewriter.replaceOp(op, result); + return success(); + } +}; + struct ConvertAMDGPUToROCDLPass : public impl::ConvertAMDGPUToROCDLPassBase { using Base::Base; @@ -2278,6 +2348,10 @@ struct ConvertAMDGPUToROCDLPass RewritePatternSet patterns(ctx); LLVMTypeConverter converter(ctx); + converter.addConversion([&](TDMBaseType type) -> Type { + Type i32 = IntegerType::get(type.getContext(), 32); + return converter.convertType(VectorType::get(4, i32)); + }); populateAMDGPUToROCDLConversionPatterns(converter, patterns, *maybeChipset); LLVMConversionTarget target(getContext()); 
target.addIllegalDialect<::mlir::amdgpu::AMDGPUDialect>(); @@ -2333,6 +2407,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering, GatherToLDSOpLowering, TransposeLoadOpLowering, - AMDGPUPermlaneLowering>(converter, chipset); + AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering>(converter, + chipset); patterns.add(converter); } diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 5ff640b5d1596..8fc6220efc6ad 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -41,6 +41,38 @@ using namespace mlir::amdgpu; #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.cpp.inc" +namespace mlir::amdgpu { +bool hasGlobalMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return true; + if (auto intMemorySpace = dyn_cast(memorySpace)) + return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1; + if (auto gpuMemorySpace = dyn_cast(memorySpace)) + return gpuMemorySpace.getValue() == gpu::AddressSpace::Global; + return false; +} + +bool hasWorkgroupMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; + if (auto intMemorySpace = dyn_cast(memorySpace)) + return intMemorySpace.getInt() == 3; + if (auto gpuMemorySpace = dyn_cast(memorySpace)) + return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup; + return false; +} + +bool hasFatRawBufferMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; + if (auto intMemorySpace = dyn_cast(memorySpace)) + return intMemorySpace.getInt() == 7; + if (auto gpuMemorySpace = dyn_cast(memorySpace)) + return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer; + return false; +} +} // namespace mlir::amdgpu + namespace { struct AMDGPUInlinerInterface final : DialectInlinerInterface { using 
DialectInlinerInterface::DialectInlinerInterface; @@ -158,36 +190,6 @@ LogicalResult FatRawBufferCastOp::verify() { return success(); } -static bool hasGlobalMemorySpace(Attribute memorySpace) { - if (!memorySpace) - return true; - if (auto intMemorySpace = dyn_cast(memorySpace)) - return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1; - if (auto gpuMemorySpace = dyn_cast(memorySpace)) - return gpuMemorySpace.getValue() == gpu::AddressSpace::Global; - return false; -} - -static bool hasWorkgroupMemorySpace(Attribute memorySpace) { - if (!memorySpace) - return false; - if (auto intMemorySpace = dyn_cast(memorySpace)) - return intMemorySpace.getInt() == 3; - if (auto gpuMemorySpace = dyn_cast(memorySpace)) - return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup; - return false; -} - -static bool hasFatRawBufferMemorySpace(Attribute memorySpace) { - if (!memorySpace) - return false; - if (auto intMemorySpace = dyn_cast(memorySpace)) - return intMemorySpace.getInt() == 7; - if (auto gpuMemorySpace = dyn_cast(memorySpace)) - return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer; - return false; -} - //===----------------------------------------------------------------------===// // RawBuffer*Op //===----------------------------------------------------------------------===// @@ -705,6 +707,23 @@ LogicalResult TransposeLoadOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// MakeDmaBaseOp +//===----------------------------------------------------------------------===// + +LogicalResult MakeDmaBaseOp::verify() { + MemRefType srcType = cast(getSrc().getType()); + MemRefType dstType = cast(getDst().getType()); + bool store_from_lds = hasWorkgroupMemorySpace(srcType.getMemorySpace()) && + hasGlobalMemorySpace(dstType.getMemorySpace()); + bool load_to_lds = hasGlobalMemorySpace(srcType.getMemorySpace()) && + hasWorkgroupMemorySpace(dstType.getMemorySpace()); + bool 
is_valid = store_from_lds != load_to_lds; + if (!is_valid) + return emitOpError("invalid combination of address spaces."); + return success(); +} + //===----------------------------------------------------------------------===// // MakeDmaDescriptorOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir similarity index 73% rename from mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir rename to mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index d2391140ce056..96d03a427215f 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -162,3 +162,77 @@ func.func @amdgpu.scaled_ext_packed816_invalid_dst_elem_type(%v: vector<16xf6E3M %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf64> return %ret0: vector<16xf64> } + +// ----- + +#gpu_global_addrspace = 1 +#gpu_lds_addrspace = 3 +#amdgpu_fat_buffer_addrspace = 7 + +// CHECK-LABEL: func @make_dma_base +// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, 1>, %[[SMEM:.+]]: memref<8xi32, 3>) +func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xi32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base, !amdgpu.tdm_base) { + // CHECK-DAG: %[[INT:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64 + // CHECK-DAG: %[[MEMREF_DESC_MEM:.+]] = builtin.unrealized_conversion_cast %[[MEM]] : memref<8xi32, 1> + // CHECK-DAG: %[[MEMREF_DESC_SMEM:.+]] = builtin.unrealized_conversion_cast %[[SMEM]] : memref<8xi32, 3> + + // CHECK-DAG: %[[MEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_MEM]][1] : !llvm.struct<(ptr<1> + // CHECK-DAG: %[[SMEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_SMEM]][1] : !llvm.struct<(ptr<3> + + // 
CHECK-DAG: %[[MEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[MEM_BASE_PTR]][%[[INT]]] + // CHECK-DAG: %[[SMEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[SMEM_BASE_PTR]][%[[INT]]] + + // CHECK-DAG: %[[MEM_INT:.+]] = llvm.ptrtoint %[[MEM_BASE_OFFSET]] : !llvm.ptr<1> to i64 + // CHECK-DAG: %[[SMEM_INT:.+]] = llvm.ptrtoint %[[SMEM_BASE_OFFSET]] : !llvm.ptr<3> to i32 + + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(144115188075855871 : i64) : i64 + // CHECK: %[[MEM_INT_LOW_57:.+]] = llvm.and %[[MEM_INT]], %[[MASK]] + // CHECK: %[[C32:.+]] = llvm.mlir.constant(32 : i64) : i64 + // CHECK: %[[SHIFT:.+]] = llvm.lshr %[[MEM_INT_LOW_57]], %[[C32]] + // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32 + // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32 + + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) : i32 + + // CHECK: %[[V4I32_0_0:.+]] = llvm.mlir.undef : vector<4xi32> + // CHECK: %[[V4I32_0_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_0_0]][%[[C0]] : i32] + // CHECK: %[[V4I32_0_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_0_1]][%[[C1]] : i32] + // CHECK: %[[V4I32_0_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_0_2]][%[[C2]] : i32] + // CHECK: %[[V4I32_0_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_0_3]][%[[C3]] : i32] + + %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xi32, #gpu_lds_addrspace> -> !amdgpu.tdm_base + + // CHECK-DAG: %[[MEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_MEM]][1] : !llvm.struct<(ptr<1> + // CHECK-DAG: %[[SMEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_SMEM]][1] : !llvm.struct<(ptr<3> + + // CHECK-DAG: %[[MEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[MEM_BASE_PTR]][%[[INT]]] + // CHECK-DAG: %[[SMEM_BASE_OFFSET:.+]] = 
llvm.getelementptr %[[SMEM_BASE_PTR]][%[[INT]]] + + // CHECK-DAG: %[[MEM_INT:.+]] = llvm.ptrtoint %[[MEM_BASE_OFFSET]] : !llvm.ptr<1> to i64 + // CHECK-DAG: %[[SMEM_INT:.+]] = llvm.ptrtoint %[[SMEM_BASE_OFFSET]] : !llvm.ptr<3> to i32 + + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(144115188075855871 : i64) : i64 + // CHECK: %[[MEM_INT_LOW_57:.+]] = llvm.and %[[MEM_INT]], %[[MASK]] + // CHECK: %[[C32:.+]] = llvm.mlir.constant(32 : i64) : i64 + // CHECK: %[[SHIFT:.+]] = llvm.lshr %[[MEM_INT_LOW_57]], %[[C32]] + // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32 + // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32 + + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) : i32 + + // CHECK: %[[V4I32_1_0:.+]] = llvm.mlir.undef : vector<4xi32> + // CHECK: %[[V4I32_1_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_1_0]][%[[C0]] : i32] + // CHECK: %[[V4I32_1_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_1_1]][%[[C1]] : i32] + // CHECK: %[[V4I32_1_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_1_2]][%[[C2]] : i32] + // CHECK: %[[V4I32_1_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_1_3]][%[[C3]] : i32] + + %1 = amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu_lds_addrspace>, memref<8xi32, #gpu_global_addrspace> -> !amdgpu.tdm_base + + func.return %0, %1 : !amdgpu.tdm_base, !amdgpu.tdm_base +} From 3ee5464060d2bae3e071b4910ef09e0b0d4f6728 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Thu, 27 Nov 2025 09:41:38 -0500 Subject: [PATCH 4/9] Update documentation --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 
9cb0752fba48b..1806c747046b8 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1258,19 +1258,21 @@ def AMDGPU_MakeDmaBaseOp : to ```mlir - // pseudocode - %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)> - %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)> - %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct(ptr, ptr)> - // type(%base_2) = !llvm.struct<(ptr, ptr) roughly corresponds to amdgpu.tdm_base - - // The base will be used when contructing dgroup0 - // when lowering amdgpu.make_dma_descriptor - %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)> - %dgroup0_1 = llvm.insertvalue %base2, %dgroup0_0 : .... - - // When lowering amdgpu.tensor_load_to_lds - rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + // pseudo-code + %global_base = llvm.extractvalue %global_memref[1] + %global_address = llvm.get_element_ptr ... + + %lds_base = llvm.extractvalue %lds_memref[1] + %lds_address = llvm.get_element_ptr ... + + // Definition of %base + %undef = llvm.mlir.undef : vector<4xi32> + %v0 = llvm.insertelement %15, %undef[0] : vector<4xi32> + %v1 = llvm.insertelement %lds_address, %v0[1] : vector<4xi32> + %v2 = llvm.insertelement %global_address_low, %v1[2] : vector<4xi32> + %base = llvm.insertelement %global_address_high, %v2[3] : vector<4xi32> + + rocdl.tensor.load.to.lds %base, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> ``` These tensor DMA operations were introduced in gfx1250. From 7aa7699e3de3624c58b026ea9087a63b2033ff61 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Thu, 27 Nov 2025 11:10:03 -0500 Subject: [PATCH 5/9] [amdgpu][mlir] make_dma_base add type information. 
--- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 7 ++++++- mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 10 ++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 3316e16a05d5c..452c4e96e62c1 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2316,6 +2316,10 @@ struct AMDGPUMakeDmaBaseLowering LLVM::TruncOp::create(rewriter, loc, i32, first57BitsOfGlobalAddr); Value highHalf = LLVM::TruncOp::create(rewriter, loc, i32, shift); + Value typeMask = createI32Constant(rewriter, loc, 2 << 30); + Value highHalfPlusType = + LLVM::OrOp::create(rewriter, loc, highHalf, typeMask); + Value c0 = createI32Constant(rewriter, loc, 0); Value c1 = createI32Constant(rewriter, loc, 1); Value c2 = createI32Constant(rewriter, loc, 2); @@ -2327,7 +2331,8 @@ struct AMDGPUMakeDmaBaseLowering result = LLVM::InsertElementOp::create(rewriter, loc, result, castForLdsAddr, c1); result = LLVM::InsertElementOp::create(rewriter, loc, result, lowHalf, c2); - result = LLVM::InsertElementOp::create(rewriter, loc, result, highHalf, c3); + result = LLVM::InsertElementOp::create(rewriter, loc, result, + highHalfPlusType, c3); rewriter.replaceOp(op, result); return success(); diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index 96d03a427215f..514ed9094da53 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -192,6 +192,9 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace> // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32 // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32 + // CHECK-DAG: %[[TYPE_MASK:.+]] = llvm.mlir.constant(-2147483648 : i32) + // CHECK: %[[MEM_INT_HIGH_TYPE:.+]] 
= llvm.or %[[MEM_INT_HIGH]], %[[TYPE_MASK]] + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32 @@ -201,7 +204,7 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace> // CHECK: %[[V4I32_0_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_0_0]][%[[C0]] : i32] // CHECK: %[[V4I32_0_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_0_1]][%[[C1]] : i32] // CHECK: %[[V4I32_0_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_0_2]][%[[C2]] : i32] - // CHECK: %[[V4I32_0_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_0_3]][%[[C3]] : i32] + // CHECK: %[[V4I32_0_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH_TYPE]], %[[V4I32_0_3]][%[[C3]] : i32] %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xi32, #gpu_lds_addrspace> -> !amdgpu.tdm_base @@ -221,6 +224,9 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace> // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32 // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32 + // CHECK-DAG: %[[TYPE_MASK:.+]] = llvm.mlir.constant(-2147483648 : i32) + // CHECK: %[[MEM_INT_HIGH_TYPE:.+]] = llvm.or %[[MEM_INT_HIGH]], %[[TYPE_MASK]] + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32 @@ -230,7 +236,7 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace> // CHECK: %[[V4I32_1_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_1_0]][%[[C0]] : i32] // CHECK: %[[V4I32_1_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_1_1]][%[[C1]] : i32] // CHECK: %[[V4I32_1_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_1_2]][%[[C2]] : i32] - // CHECK: %[[V4I32_1_4:.+]] = llvm.insertelement 
%[[MEM_INT_HIGH]], %[[V4I32_1_3]][%[[C3]] : i32] + // CHECK: %[[V4I32_1_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH_TYPE]], %[[V4I32_1_3]][%[[C3]] : i32] %1 = amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu_lds_addrspace>, memref<8xi32, #gpu_global_addrspace> -> !amdgpu.tdm_base From 9f37e601e97e024a9c7ed6877acbc22be154a5ab Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Thu, 27 Nov 2025 12:40:30 -0500 Subject: [PATCH 6/9] [mlir][amdgpu] Add AllElementTypesMatch attribute to make_dma_base --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +- mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 1806c747046b8..23eacab216468 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1228,7 +1228,7 @@ def AMDGPU_ScaledMFMAOp : } def AMDGPU_MakeDmaBaseOp : - AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>, + AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments, AllElementTypesMatch<["src", "dst"]>]>, Arguments<(ins Arg:$src, Variadic:$src_indices, diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index 514ed9094da53..272c7b375b9f8 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -169,6 +169,18 @@ func.func @amdgpu.scaled_ext_packed816_invalid_dst_elem_type(%v: vector<16xf6E3M #gpu_lds_addrspace = 3 #amdgpu_fat_buffer_addrspace = 7 +func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xf32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_base' op failed to verify that all of {src, dst} have same element type}} + %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : 
memref<8xi32, #gpu_global_addrspace>, memref<8xf32, #gpu_lds_addrspace> -> !amdgpu.tdm_base + return %0 : !amdgpu.tdm_base +} + +// ----- + +#gpu_global_addrspace = 1 +#gpu_lds_addrspace = 3 +#amdgpu_fat_buffer_addrspace = 7 + // CHECK-LABEL: func @make_dma_base // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, 1>, %[[SMEM:.+]]: memref<8xi32, 3>) func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xi32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base, !amdgpu.tdm_base) { From 3a427759bdcafde5905582f3e92300cf6eeab0f1 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Thu, 27 Nov 2025 13:11:59 -0500 Subject: [PATCH 7/9] [mlir][amdgpu] verify element type sizes for make_dma_base --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 17 +++++++++++++++++ mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 12 ++++++++++++ 2 files changed, 29 insertions(+) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 8fc6220efc6ad..75b4fdb3fbdd5 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -721,6 +721,23 @@ LogicalResult MakeDmaBaseOp::verify() { bool is_valid = store_from_lds != load_to_lds; if (!is_valid) return emitOpError("invalid combination of address spaces."); + + Type elementType = srcType.getElementType(); + int width; + if (auto intType = dyn_cast(elementType)) { + width = intType.getWidth(); + } else if (auto floatType = dyn_cast(elementType)) { + width = floatType.getWidth(); + } else { + return emitOpError("element type must have type width"); + } + + if (!llvm::is_contained({8, 16, 32, 64}, width)) { + return emitOpError( + "element type must be 1, 2, 4, or 8 bytes long but type was ") + << width << " bits long."; + } + return success(); } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index 272c7b375b9f8..172664e8a0e8d 100644 --- 
a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -181,6 +181,18 @@ func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref< #gpu_lds_addrspace = 3 #amdgpu_fat_buffer_addrspace = 7 +func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref<8xi7, #gpu_global_addrspace>, %smem: memref<8xi7,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_base' op element type must be 1, 2, 4, or 8 bytes long but type was 7 bits long.}} + %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi7, #gpu_global_addrspace>, memref<8xi7, #gpu_lds_addrspace> -> !amdgpu.tdm_base + return %0 : !amdgpu.tdm_base +} + +// ----- + +#gpu_global_addrspace = 1 +#gpu_lds_addrspace = 3 +#amdgpu_fat_buffer_addrspace = 7 + // CHECK-LABEL: func @make_dma_base // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, 1>, %[[SMEM:.+]]: memref<8xi32, 3>) func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xi32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base, !amdgpu.tdm_base) { From c0cd803d7e3ee06cd89e77d5d5a45b0adc1242ab Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 14:06:54 -0500 Subject: [PATCH 8/9] [mlir][amdgpu] Lower make_dma_descriptor Initial lowering for make_dma_descriptor. At the moment it only supports tensors of rank 2. 
--- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 53 ++- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 416 +++++++++++++++++- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 10 + .../Conversion/AMDGPUToROCDL/gfx1250.mlir | 91 ++++ mlir/test/Dialect/AMDGPU/invalid.mlir | 19 +- mlir/test/Dialect/AMDGPU/ops.mlir | 48 +- 6 files changed, 582 insertions(+), 55 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 23eacab216468..28efa246689a1 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1295,8 +1295,8 @@ def AMDGPU_MakeDmaDescriptorOp : DenseI64ArrayAttr: $global_static_strides, Variadic: $shared_dynamic_sizes, DenseI64ArrayAttr: $shared_static_sizes, - Optional: $pad, - Optional: $pad_every, + Optional: $pad_amount, + Optional: $pad_interval, Optional: $atomic_barrier_address, Variadic: $atomic_barrier_indices, Optional: $global_increment, @@ -1332,7 +1332,7 @@ def AMDGPU_MakeDmaDescriptorOp : // Example of moving a two dimension tensor to LDS where padding is applied after every integer. 
%base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base - %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(%pad pad_every %pad_every) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(%pad_amount every %pad_interval) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor ``` }]; @@ -1342,13 +1342,58 @@ def AMDGPU_MakeDmaDescriptorOp : `globalSize` custom($global_dynamic_sizes, $global_static_sizes) `globalStride` custom($global_dynamic_strides, $global_static_strides) `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) - ( `padShared` `(` $pad^ `every` $pad_every `)` )? + ( `padShared` `(` $pad_amount^ `every` $pad_interval`)` )? ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]` `:` type($atomic_barrier_address) `)`)? ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )? 
attr-dict `:` qualified(type($base)) `->` type(results) }]; + let extraClassDeclaration = [{ + int getRank() { + return getGlobalStaticSizes().size(); + } + + int getElementTypeWidth() { + Type elementType = getBase().getType().getElementType(); + int width; + if (auto floatType = dyn_cast(elementType)) { + width = floatType.getWidth(); + } else if (auto intType = dyn_cast(elementType)) { + width = intType.getWidth(); + } else { + llvm_unreachable("element type must have getWidth interface"); + } + return width; + } + + SmallVector getMixedList(SmallVector dynamics, ArrayRef statics) { + SmallVector result; + unsigned ctr = 0; + OpBuilder b(getContext()); + for (int64_t static_elem : statics) { + if (ShapedType::isDynamic(static_elem)) { + result.push_back(dynamics[ctr++]); + } else { + result.push_back(b.getIndexAttr(static_elem)); + } + } + return result; + } + + SmallVector getMixedGlobalSizes() { + return getMixedList(getGlobalDynamicSizes(), getGlobalStaticSizes()); + } + + SmallVector getMixedGlobalStrides() { + return getMixedList(getGlobalDynamicStrides(), getGlobalStaticStrides()); + } + + SmallVector getMixedSharedSizes() { + return getMixedList(getSharedDynamicSizes(), getSharedStaticSizes()); + } + }]; + let hasVerifier = 1; } diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 452c4e96e62c1..1e81d339b0ddc 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2275,8 +2275,9 @@ struct AMDGPUMakeDmaBaseLowering LogicalResult matchAndRewrite(MakeDmaBaseOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - if (chipset < kGfx1250) + if (chipset < kGfx1250) { return op->emitOpError("make_dma_base is only supported on gfx1250"); + } Location loc = op.getLoc(); @@ -2339,6 +2340,375 @@ struct AMDGPUMakeDmaBaseLowering } }; +struct AMDGPUMakeDmaDescriptorLowering + : public 
ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + AMDGPUMakeDmaDescriptorLowering(const LLVMTypeConverter &converter, + Chipset chipset) + : ConvertOpToLLVMPattern(converter), + chipset(chipset) {} + Chipset chipset; + + Value getDGroup0(OpAdaptor adaptor) const { return adaptor.getBase(); } + + Value setValueAtOffset(ConversionPatternRewriter &rewriter, Location loc, + Value accumulator, Value value, int shift) const { + shift = shift % 32; + Value shiftAmount; + if (shift != 0) { + shiftAmount = createI32Constant(rewriter, loc, shift % 32); + value = LLVM::ShlOp::create(rewriter, loc, value, shiftAmount); + } + return LLVM::OrOp::create(rewriter, loc, accumulator, value); + } + + Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, const SmallVector consts) const { + // Compute data_size. + int elementTypeWidthInBytes = op.getElementTypeWidth() / 8; + + Value dataSize; + switch (elementTypeWidthInBytes) { + case 1: + dataSize = consts[0]; + break; + case 2: + dataSize = consts[1]; + break; + case 4: + dataSize = consts[2]; + break; + case 8: + dataSize = consts[3]; + break; + default: + llvm_unreachable("Invalid element size."); + } + return setValueAtOffset(rewriter, loc, sgpr0, dataSize, 16); + } + + Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, const SmallVector &consts) const { + bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr; + if (!atomic_barrier_enable) + return sgpr0; + + return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 18); + } + + Value setIterateEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, const SmallVector &consts) const { + bool iterate_enable = adaptor.getGlobalIncrement() != nullptr; + if (!iterate_enable) + return sgpr0; + + return 
setValueAtOffset(rewriter, loc, sgpr0, consts[1], 19); + } + + Value setPadEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, const SmallVector &consts) const { + bool pad_enable = op.getPadAmount() != nullptr; + if (!pad_enable) + return sgpr0; + + return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 20); + } + + Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, const SmallVector &consts) const { + bool pad_enable = op.getPadAmount() != nullptr; + if (!pad_enable) + return sgpr0; + + IntegerType i32 = rewriter.getI32Type(); + Value padInterval = adaptor.getPadInterval(); + // pre-condition: padInterval can be a power of two between 2 and 256 + padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32, + padInterval, false); + padInterval = LLVM::SubOp::create(rewriter, loc, padInterval, consts[1]); + // post-condition: padInterval can be a value between 0 and 7 + return setValueAtOffset(rewriter, loc, sgpr0, padInterval, 22); + } + + Value setPadAmount(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, const SmallVector &consts) const { + bool pad_enable = op.getPadAmount() != nullptr; + if (!pad_enable) + return sgpr0; + + Value padAmount = adaptor.getPadAmount(); + // pre-condition: padAmount is a value between 1-128 + padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]); + // post-condition: padAmount is a value between 0-127 + return setValueAtOffset(rewriter, loc, sgpr0, padAmount, 25); + } + + Value setAtomicBarrierAddress(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + Location loc, Value sgpr1, + const SmallVector &consts) const { + bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr; + if (!atomic_barrier_enable) + return sgpr1; + + Value atomicBarrierAddress = 
adaptor.getAtomicBarrierAddress(); + IntegerType i32 = rewriter.getI32Type(); + atomicBarrierAddress = + LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress); + atomicBarrierAddress = + LLVM::LShrOp::create(rewriter, loc, atomicBarrierAddress, consts[3]); + Value mask = createI32Constant(rewriter, loc, 0xFFFF); + atomicBarrierAddress = + LLVM::AndOp::create(rewriter, loc, atomicBarrierAddress, mask); + return setValueAtOffset(rewriter, loc, sgpr1, atomicBarrierAddress, 32); + } + + std::pair + setTensorDim0(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, Value sgpr1, + Value sgpr2, const SmallVector &consts) const { + SmallVector mixedGlobalSizes = op.getMixedGlobalSizes(); + OpFoldResult tensorDim0OpFoldResult = mixedGlobalSizes.back(); + Value tensorDim0; + if (auto attr = dyn_cast(tensorDim0OpFoldResult)) { + tensorDim0 = + createI32Constant(rewriter, loc, cast(attr).getInt()); + } else { + tensorDim0 = cast(tensorDim0OpFoldResult); + } + Value c16 = createI32Constant(rewriter, loc, 16); + Value tensorDim0High = LLVM::LShrOp::create(rewriter, loc, tensorDim0, c16); + sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDim0, 48); + sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim0High, 48 + 16); + return {sgpr1, sgpr2}; + } + + std::pair + setTensorDim1(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, Value sgpr2, + Value sgpr3, const SmallVector &consts) const { + SmallVector mixedGlobalSizes = op.getMixedGlobalSizes(); + OpFoldResult tensorDim1OpFoldResult = *(mixedGlobalSizes.rbegin() + 1); + Value tensorDim1; + if (auto attr = dyn_cast(tensorDim1OpFoldResult)) { + tensorDim1 = + createI32Constant(rewriter, loc, cast(attr).getInt()); + } else { + tensorDim1 = cast(tensorDim1OpFoldResult); + } + Value c16 = createI32Constant(rewriter, loc, 16); + Value tensorDim1High = LLVM::LShrOp::create(rewriter, loc, tensorDim1, c16); + sgpr2 = 
setValueAtOffset(rewriter, loc, sgpr2, tensorDim1, 80); + sgpr3 = setValueAtOffset(rewriter, loc, sgpr3, tensorDim1High, 80 + 16); + return {sgpr2, sgpr3}; + } + + Value setTileDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr, const SmallVector &consts, size_t dimX, + int offset) const { + SmallVector mixedSharedSizes = op.getMixedSharedSizes(); + + if (mixedSharedSizes.size() <= dimX) { + return sgpr; + } + + OpFoldResult tileDimXOpFoldResult = *(mixedSharedSizes.rbegin() + dimX); + Value tileDimX; + if (auto attr = dyn_cast(tileDimXOpFoldResult)) { + tileDimX = + createI32Constant(rewriter, loc, cast(attr).getInt()); + } else { + tileDimX = cast(tileDimXOpFoldResult); + } + return setValueAtOffset(rewriter, loc, sgpr, tileDimX, offset); + } + + Value setTileDim0(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr3, const SmallVector &consts) const { + return setTileDimX(op, adaptor, rewriter, loc, sgpr3, consts, 0, 112); + } + + Value setTileDim1(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr4, const SmallVector &consts) const { + return setTileDimX(op, adaptor, rewriter, loc, sgpr4, consts, 1, 128); + } + + Value setTileDim2(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr4, const SmallVector &consts) const { + return setTileDimX(op, adaptor, rewriter, loc, sgpr4, consts, 2, 144); + } + + std::pair + setTensorDimXStride(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgprY, Value sgprZ, + const SmallVector &consts, size_t dimX, + int offset) const { + SmallVector mixedGlobalStrides = op.getMixedGlobalStrides(); + + if (mixedGlobalStrides.size() <= dimX) { + return {sgprY, sgprZ}; + } + + OpFoldResult tensorDimXStrideOpFoldResult = + *(mixedGlobalStrides.rbegin() + 
dimX); + Value tensorDimXStride; + if (auto attr = dyn_cast(tensorDimXStrideOpFoldResult)) { + tensorDimXStride = + createI64Constant(rewriter, loc, cast(attr).getInt()); + } else { + tensorDimXStride = cast(tensorDimXStrideOpFoldResult); + } + + constexpr int64_t first48bits = 0xFFFFFFFFFFFF; + Value mask = createI64Constant(rewriter, loc, first48bits); + tensorDimXStride = + LLVM::AndOp::create(rewriter, loc, mask, tensorDimXStride); + IntegerType i32 = rewriter.getI32Type(); + Value tensorDimXStrideLow = + LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStride); + + int shift = (offset % 32) == 0 ? 32 : offset % 32; + Value shiftVal = createI64Constant(rewriter, loc, shift); + Value tensorDimXStrideHigh = + LLVM::LShrOp::create(rewriter, loc, tensorDimXStride, shiftVal); + tensorDimXStrideHigh = + LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStrideHigh); + + sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset); + sgprZ = setValueAtOffset(rewriter, loc, sgprZ, tensorDimXStrideHigh, + offset + shift); + return {sgprY, sgprZ}; + } + + std::pair + setTensorDim0Stride(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr5, Value sgpr6, + const SmallVector &consts) const { + return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts, + 0, 160); + } + + std::pair + setTensorDim1Stride(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr5, Value sgpr6, + const SmallVector &consts) const { + return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts, + 1, 208); + } + + Value getDGroup1(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + const SmallVector &consts) const { + + Value sgpr0, sgpr1, sgpr2, sgpr3, sgpr4, sgpr5, sgpr6, sgpr7; + sgpr0 = sgpr1 = sgpr2 = sgpr3 = sgpr4 = sgpr5 = sgpr6 = sgpr7 = consts[0]; + + sgpr0 = setDataSize(op, adaptor, 
rewriter, loc, sgpr0, consts); + sgpr0 = setAtomicBarrier(op, adaptor, rewriter, loc, sgpr0, consts); + sgpr0 = setIterateEnable(op, adaptor, rewriter, loc, sgpr0, consts); + sgpr0 = setPadEnable(op, adaptor, rewriter, loc, sgpr0, consts); + sgpr0 = setPadInterval(op, adaptor, rewriter, loc, sgpr0, consts); + sgpr0 = setPadAmount(op, adaptor, rewriter, loc, sgpr0, consts); + + sgpr1 = setAtomicBarrierAddress(op, adaptor, rewriter, loc, sgpr1, consts); + std::tie(sgpr1, sgpr2) = + setTensorDim0(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts); + std::tie(sgpr2, sgpr3) = + setTensorDim1(op, adaptor, rewriter, loc, sgpr2, sgpr3, consts); + + sgpr3 = setTileDim0(op, adaptor, rewriter, loc, sgpr3, consts); + sgpr4 = setTileDim1(op, adaptor, rewriter, loc, sgpr4, consts); + sgpr4 = setTileDim2(op, adaptor, rewriter, loc, sgpr4, consts); + std::tie(sgpr5, sgpr6) = + setTensorDim0Stride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts); + std::tie(sgpr6, sgpr7) = + setTensorDim1Stride(op, adaptor, rewriter, loc, sgpr6, sgpr7, consts); + + IntegerType i32 = rewriter.getI32Type(); + Type v8i32 = this->typeConverter->convertType(VectorType::get(8, i32)); + Value dgroup1 = LLVM::UndefOp::create(rewriter, loc, v8i32); + + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr0, consts[0]); + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr1, consts[1]); + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr2, consts[2]); + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr3, consts[3]); + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr4, consts[4]); + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr5, consts[5]); + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr6, consts[6]); + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr7, consts[7]); + + return dgroup1; + } + + LogicalResult + 
matchAndRewrite(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (chipset < kGfx1250) { + return op->emitOpError( + "make_dma_descriptor is only supported on gfx1250"); + } + + if (op.getRank() != 2) { + return op->emitOpError("unimplemented"); + } + + Location loc = op.getLoc(); + + IntegerType i32 = rewriter.getI32Type(); + Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32)); + + SmallVector consts; + for (int i = 0; i < 8; i++) { + consts.push_back(createI32Constant(rewriter, loc, i)); + } + + Value dgroup0 = this->getDGroup0(adaptor); + Value dgroup1 = this->getDGroup1(op, adaptor, rewriter, loc, consts); + Value undefV4I32 = LLVM::UndefOp::create(rewriter, loc, v4i32); + Value dgroup2 = undefV4I32; + Value dgroup3 = undefV4I32; + + if (op.getRank() == 2) { + Value nullConstant = createI32Constant(rewriter, loc, 0x7c); + dgroup2 = LLVM::InsertElementOp::create(rewriter, loc, dgroup2, + nullConstant, consts[0]); + dgroup2 = LLVM::InsertElementOp::create(rewriter, loc, dgroup2, consts[0], + consts[1]); + dgroup2 = LLVM::InsertElementOp::create(rewriter, loc, dgroup2, consts[0], + consts[2]); + dgroup2 = LLVM::InsertElementOp::create(rewriter, loc, dgroup2, consts[0], + consts[3]); + dgroup3 = dgroup2; + } + + SmallVector results = {dgroup0, dgroup1, dgroup2, dgroup3}; + rewriter.replaceOpWithMultiple(op, {results}); + return success(); + } +}; + struct ConvertAMDGPUToROCDLPass : public impl::ConvertAMDGPUToROCDLPassBase { using Base::Base; @@ -2392,27 +2762,27 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, Chipset chipset) { populateAMDGPUMemorySpaceAttributeConversions(converter); - patterns - .add, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering, - 
SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering, - WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering, - ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, - PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering, - GatherToLDSOpLowering, TransposeLoadOpLowering, - AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering>(converter, - chipset); + patterns.add< + FatRawBufferCastLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering, + SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering, + WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering, + ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, + PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering, + GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering, + AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering>(converter, + chipset); patterns.add(converter); } diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 75b4fdb3fbdd5..42797dadbb7e0 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -757,6 +757,9 @@ LogicalResult MakeDmaDescriptorOp::verify() { ArrayRef globalStaticSizes = getGlobalStaticSizes(); size_t rank = globalStaticSizes.size(); + if (rank < 2) { + return emitOpError("tensor and tile must be at least of rank 2."); + } if (rank != globalStaticStrides.size()) { return emitOpError("strides and sizes must have same rank."); } @@ -766,6 +769,13 @@ LogicalResult MakeDmaDescriptorOp::verify() { return emitOpError("tensor must have same rank as tile."); } + int elementTypeWidth = getElementTypeWidth(); + if (!llvm::is_contained({8, 16, 32, 64}, elementTypeWidth)) { + return emitOpError( + "element type width must be 1, 2, 
4 or 8 bytes, but was ") + << elementTypeWidth << " bits long"; + } + return success(); } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index 172664e8a0e8d..e774cc8ca4f70 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -266,3 +266,94 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace> func.return %0, %1 : !amdgpu.tdm_base, !amdgpu.tdm_base } + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) -> !amdgpu.tdm_descriptor { + // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]] + + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) + // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) + // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) + // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) + // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32) + // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32) + // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32) + // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32) + + // CHECK: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[SIZE:.+]] = llvm.shl %[[C2]], %[[C16]] + // CHECK: %[[SGPR0:.+]] = llvm.or %[[C0]], %[[SIZE]] + + // CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32) + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TENSOR_DIM_0_HIGH:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]] + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TENSOR_DIM_0_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]] + // CHECK: %[[SGPR1:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_SHIFTED]] + // CHECK: %[[SGPR2_0:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_HIGH]] + + // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32) + // CHECK-DAG: %[[C16:.+]] = 
llvm.mlir.constant(16 : i32) + // CHECK: %[[TENSOR_DIM_1_HIGH:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]] + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]] + // CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]] + // CHECK: %[[SGPR3_0:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_1_HIGH]] + + // CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32) + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[C16]] + // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_0]], %[[TILE_DIM_0_SHIFTED]] + + // CHECK-DAG: %[[TILE_DIM_1:.+]] = llvm.mlir.constant(128 : i32) + // CHECK: %[[SGPR4:.+]] = llvm.or %[[C0]], %[[TILE_DIM_1]] + + // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE:.+]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64 + // CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]] + // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32 + // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64 + // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]] + // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32 + // CHECK: %[[SGPR5:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_STRIDE_LOW]] + // CHECK: %[[SGPR6_0:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_STRIDE_HIGH]] + + // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64) + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64 + // CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]] + // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]] + // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64 
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]] + // CHECK: %[[TENSOR_DIM_1_STRIDE_HIGH:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32 + // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]] + // CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]] + // CHECK-DAG: %[[SGPR7:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_1_STRIDE_HIGH]] + + // CHECK: %[[V8I32:.+]] = llvm.mlir.undef : vector<8xi32> + // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32] + // CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32] + // CHECK: %[[DGROUP1_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP1_1]][%[[C2]] : i32] + // CHECK: %[[DGROUP1_3:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP1_2]][%[[C3]] : i32] + // CHECK: %[[DGROUP1_4:.+]] = llvm.insertelement %[[SGPR4]], %[[DGROUP1_3]][%[[C4]] : i32] + // CHECK: %[[DGROUP1_5:.+]] = llvm.insertelement %[[SGPR5]], %[[DGROUP1_4]][%[[C5]] : i32] + // CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32] + // CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32] + + // CHECK-DAG: %[[V4I32:.+]] = llvm.mlir.undef : vector<4xi32> + + // CHECK-DAG: %[[NULL:.+]] = llvm.mlir.constant(124 : i32) + + // CHECK: %[[NULL_GROUP_0:.+]] = llvm.insertelement %[[NULL]], %[[V4I32]][%[[C0]] : i32] + // CHECK: %[[NULL_GROUP_1:.+]] = llvm.insertelement %[[C0]], %[[NULL_GROUP_0]][%[[C1]] : i32] + // CHECK: %[[NULL_GROUP_2:.+]] = llvm.insertelement %[[C0]], %[[NULL_GROUP_1]][%[[C2]] : i32] + // CHECK: %[[NULL_GROUP:.+]] = llvm.insertelement %[[C0]], %[[NULL_GROUP_2]][%[[C3]] : i32] + + // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]], %[[NULL_GROUP]], %[[NULL_GROUP]] : 
vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32> to !amdgpu.tdm_descriptor + %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return %descriptor : !amdgpu.tdm_descriptor +} + diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 066f46060f62f..2374124e2a083 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -361,7 +361,7 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}} - amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [0, 1] globalStride [] sharedSize [1, 0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } @@ -371,7 +371,7 @@ func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} - amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [1, 0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } @@ -381,7 +381,7 @@ func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base< // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same 
rank.}} - amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [1, 1, 1] globalStride [1, 1] sharedSize [1, 0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } @@ -391,6 +391,17 @@ func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_ // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}} - amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [1, 2, 3] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + + +// CHECK-LABEL: func @make_dma_descriptor_invalid_rank_less_than_two +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_rank_less_than_two(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor and tile must be at least of rank 2.}} + amdgpu.make_dma_descriptor %base globalSize [4] globalStride [1, 1] sharedSize [1, 2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index aa6bedc0e1135..79cb75d782c05 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -703,45 +703,45 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor - sharedSize [0] : !amdgpu.tdm_base -> 
!amdgpu.tdm_descriptor + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + sharedSize [64, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] - sharedSize [0] + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [64, 64] // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]]) padShared(%idx every %idx) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] - sharedSize [0] + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [64, 64] // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32>) atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] - sharedSize [0] + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [64, 64] // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]] iterate %idx, %idx, %idx : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor From 2973181cd611bab90235d6f5d0b4c8f28194f560 Mon Sep 17 00:00:00 2001 
From: Erick Ochoa Date: Fri, 28 Nov 2025 14:04:00 -0500 Subject: [PATCH 9/9] Folding --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 1 + mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 79 +++++++++++++++++++ .../amdgpu-make-dma-descriptor-fold.mlir | 19 +++++ mlir/test/Dialect/AMDGPU/ops.mlir | 1 + 4 files changed, 100 insertions(+) create mode 100644 mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 28efa246689a1..c072ebdfa5d26 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1395,6 +1395,7 @@ def AMDGPU_MakeDmaDescriptorOp : }]; let hasVerifier = 1; + let hasFolder = 1; } def AMDGPU_TensorLoadToLDSOp : diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 42797dadbb7e0..002381ce8a8eb 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -760,6 +760,9 @@ LogicalResult MakeDmaDescriptorOp::verify() { if (rank < 2) { return emitOpError("tensor and tile must be at least of rank 2."); } + if (rank > 5) { + return emitOpError("tensor and tile must be at most of rank 5."); + } if (rank != globalStaticStrides.size()) { return emitOpError("strides and sizes must have same rank."); } @@ -779,6 +782,82 @@ LogicalResult MakeDmaDescriptorOp::verify() { return success(); } +static bool maybeUpdateDynamicIndexList( + ArrayRef staticElements, ArrayRef foldedElements, + SmallVector dynamicElements, SmallVector &newStaticElements, + SmallVector &newDynamicElements) { + bool changed = false; + int index = 0; + + for (int64_t static_element : staticElements) { + if (!ShapedType::isDynamic(static_element)) { + newStaticElements.push_back(static_element); + continue; + } + + Attribute folded_element = foldedElements[index++]; + if (auto attr = dyn_cast(folded_element)) { + 
newStaticElements.push_back(attr.getInt()); + changed = true; + continue; + } + + newStaticElements.push_back(ShapedType::kDynamic); + newDynamicElements.push_back(dynamicElements[index - 1]); // index was already advanced past this slot by the foldedElements read above + } + return changed; +} + +OpFoldResult MakeDmaDescriptorOp::fold(FoldAdaptor adaptor) { + ArrayRef oldGlobalStaticStrides = adaptor.getGlobalStaticStrides(); + ArrayRef foldedGlobalDynamicStrides = + adaptor.getGlobalDynamicStrides(); + SmallVector oldGlobalDynamicStrides = getGlobalDynamicStrides(); + + SmallVector newGlobalStaticStrides; + SmallVector newGlobalDynamicStrides; + + bool change = maybeUpdateDynamicIndexList( + oldGlobalStaticStrides, foldedGlobalDynamicStrides, + oldGlobalDynamicStrides, newGlobalStaticStrides, newGlobalDynamicStrides); + + ArrayRef oldGlobalStaticSizes = adaptor.getGlobalStaticSizes(); + ArrayRef foldedGlobalDynamicSizes = + adaptor.getGlobalDynamicSizes(); + SmallVector oldGlobalDynamicSizes = getGlobalDynamicSizes(); + + SmallVector newGlobalStaticSizes; + SmallVector newGlobalDynamicSizes; + + change |= maybeUpdateDynamicIndexList( + oldGlobalStaticSizes, foldedGlobalDynamicSizes, oldGlobalDynamicSizes, + newGlobalStaticSizes, newGlobalDynamicSizes); + + ArrayRef oldSharedStaticSizes = adaptor.getSharedStaticSizes(); + ArrayRef foldedSharedDynamicSizes = + adaptor.getSharedDynamicSizes(); + SmallVector oldSharedDynamicSizes = getSharedDynamicSizes(); + + SmallVector newSharedStaticSizes; + SmallVector newSharedDynamicSizes; + + change |= maybeUpdateDynamicIndexList( + oldSharedStaticSizes, foldedSharedDynamicSizes, oldSharedDynamicSizes, + newSharedStaticSizes, newSharedDynamicSizes); + + if (change) { + setGlobalStaticStrides(newGlobalStaticStrides); + getGlobalDynamicStridesMutable().assign(newGlobalDynamicStrides); + setGlobalStaticSizes(newGlobalStaticSizes); + getGlobalDynamicSizesMutable().assign(newGlobalDynamicSizes); + setSharedStaticSizes(newSharedStaticSizes); + getSharedDynamicSizesMutable().assign(newSharedDynamicSizes); + 
return getResult(); + } + + return nullptr; +} + //===----------------------------------------------------------------------===// // ScaledMFMAOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir new file mode 100644 index 0000000000000..9d43c9940f8e0 --- /dev/null +++ b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir @@ -0,0 +1,19 @@ +// RUN: mlir-opt --canonicalize %s | FileCheck %s + +// CHECK-LABEL: @make_dma_descriptor_fold +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[IDX:.+]]: index) +func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base, %idx: index) -> !amdgpu.tdm_descriptor { + %c64 = arith.constant 64 : index + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + %0 = amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [64, 64] + globalSize [%c64, %c64] + // CHECK-SAME: globalStride [64, 1] + globalStride [%c64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [%c64, %c64] + iterate %idx, %idx, %idx + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return %0 : !amdgpu.tdm_descriptor +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 79cb75d782c05..7da995df77037 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -758,3 +758,4 @@ func.func @tensor_load_store(%desc: !amdgpu.tdm_descriptor) { amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor return } +