From a4a1a59d894aae479a1bd5aebe2705431b6588b5 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Fri, 21 Nov 2025 12:56:29 -0500 Subject: [PATCH 01/29] [mlir][amdgpu] Add make_dma_base operation --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 55 +++++++++++++++++++ .../mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h | 4 ++ mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 7 +++ mlir/test/Dialect/AMDGPU/ops.mlir | 12 ++++ 4 files changed, 78 insertions(+) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 4820b7a747ac2..04043f47c3539 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -33,6 +33,7 @@ def AMDGPU_Dialect : Dialect { "gpu::GPUDialect" ]; let useDefaultAttributePrinterParser = 1; + let useDefaultTypePrinterParser = 1; } def AnyIntegerOrFloat : AnyTypeOf<[AnySignlessInteger, AnyFloat], "Integer or Float">; @@ -79,6 +80,36 @@ def AMDGPU_AddressSpaceAttr : EnumAttr traits = []> + : TypeDef { + let mnemonic = typeMnemonic; +} + +//===----------------------------------------------------------------------===// +// AMDGPU Type definitions +//===----------------------------------------------------------------------===// + +def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> { + // TODO: + // * Add verifiers such that one of the memrefs is from LDS and the other global. + // * Add verifiers to make sure that the type is in the correct direction. + // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions. + + let summary = "Pair of base addresses that move data between LDS and global storage."; + let description = [{ + This type is opaque and it is used to represent a struct of two addresses. + One address is in LDS while the other is in global memory. 
+ }]; + let parameters = (ins "Type":$elementType); + let builders = [ + TypeBuilderWithInferredContext<(ins "Type":$elementType), [{ + return $_get(elementType.getContext(), elementType); + }]> + ]; + let assemblyFormat = "`<` $elementType `>`"; + +} + //===----------------------------------------------------------------------===// // AMDGPU Op definitions //===----------------------------------------------------------------------===// @@ -1192,4 +1223,28 @@ def AMDGPU_ScaledMFMAOp : }]; let hasCanonicalizer = 1; } + +def AMDGPU_MakeDmaBaseOp : + AMDGPU_Op<"make_dma_base", [AttrSizedOperandSegments]>, + Arguments<(ins + Arg:$src, + Variadic:$srcIndices, + Arg:$dst, + Variadic:$dstIndices)>, + Results<(outs AMDGPU_TDMBaseType: $base)> { + + let summary = "Pair of based addresses used when moving tiles between LDS and global memory."; + let description = [{ + This operation creates a pair of addresses that will be used by tensor_load_to_lds + and tensor_store_from_lds. + + This operation creates a value corresponding roughly to the descriptor group 0 + found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect. 
+ }]; + + let assemblyFormat = [{ + $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results) + }]; +} + #endif // AMDGPU diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h index dcd9f95a7561f..a7680fb5c3191 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h @@ -25,6 +25,7 @@ #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h.inc" #include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.h.inc" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUTypes.h.inc" namespace mlir::amdgpu { /// Parser for the `custom` custom assembly format used by @@ -52,6 +53,9 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *, #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.h.inc" +#define GET_TYPEDEF_CLASSES +#include "mlir/Dialect/AMDGPU/IR/AMDGPUTypes.h.inc" + #define GET_OP_CLASSES #include "mlir/Dialect/AMDGPU/IR/AMDGPU.h.inc" diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index d55f3cec47c1f..cdc10c60a42ae 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -55,6 +55,10 @@ void AMDGPUDialect::initialize() { #define GET_OP_LIST #include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc" >(); + addTypes< +#define GET_TYPEDEF_LIST +#include "mlir/Dialect/AMDGPU/IR/AMDGPUTypes.cpp.inc" + >(); addAttributes< #define GET_ATTRDEF_LIST #include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc" @@ -839,5 +843,8 @@ void ScaledMFMAOp::getCanonicalizationPatterns(RewritePatternSet &results, #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc" +#define GET_TYPEDEF_CLASSES +#include "mlir/Dialect/AMDGPU/IR/AMDGPUTypes.cpp.inc" + #define GET_OP_CLASSES #include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc" diff --git 
a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 09134cb4704bb..653f9f64d24f4 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -685,3 +685,15 @@ func.func @memory_counter_wait() { amdgpu.memory_counter_wait exp(4) func.return } + +// CHECK-LABEL: func @make_dma_base +// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space>) +func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space>) { + // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base + amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base + + // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base + amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base + func.return +} + From d14f3e28cc79774adb744ae6ee6d98684f120fa7 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 09:18:26 -0500 Subject: [PATCH 02/29] Remove MemRead and MemWrite from operation --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 04043f47c3539..990d377dc9d7b 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1227,9 +1227,9 @@ def AMDGPU_ScaledMFMAOp : def AMDGPU_MakeDmaBaseOp : AMDGPU_Op<"make_dma_base", [AttrSizedOperandSegments]>, Arguments<(ins - Arg:$src, + Arg:$src, Variadic:$srcIndices, - Arg:$dst, + Arg:$dst, Variadic:$dstIndices)>, Results<(outs AMDGPU_TDMBaseType: $base)> { From d3ca18c937218a8f115e58b0a6d4d5b10bdc187a Mon Sep 17 
00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 09:20:38 -0500 Subject: [PATCH 03/29] Add Pure to make_dma_base --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 990d377dc9d7b..645fc4655025a 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1225,7 +1225,7 @@ def AMDGPU_ScaledMFMAOp : } def AMDGPU_MakeDmaBaseOp : - AMDGPU_Op<"make_dma_base", [AttrSizedOperandSegments]>, + AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>, Arguments<(ins Arg:$src, Variadic:$srcIndices, From 76e47f147ea84ec13d0a0afac5d5d2b963b9b49f Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 12:11:43 -0500 Subject: [PATCH 04/29] Add DynamicIndexList --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 44 +++++++++++++++---- mlir/test/Dialect/AMDGPU/ops.mlir | 8 ++++ 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 645fc4655025a..e2fd78dab7ebf 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -80,21 +80,17 @@ def AMDGPU_AddressSpaceAttr : EnumAttr traits = []> - : TypeDef { - let mnemonic = typeMnemonic; -} //===----------------------------------------------------------------------===// // AMDGPU Type definitions //===----------------------------------------------------------------------===// -def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> { - // TODO: - // * Add verifiers such that one of the memrefs is from LDS and the other global. - // * Add verifiers to make sure that the type is in the correct direction. - // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions. 
+class AMDGPU_Type traits = []> + : TypeDef { + let mnemonic = typeMnemonic; +} +def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> { let summary = "Pair of base addresses that move data between LDS and global storage."; let description = [{ This type is opaque and it is used to represent a struct of two addresses. @@ -107,6 +103,14 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> { }]> ]; let assemblyFormat = "`<` $elementType `>`"; +} + +def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> { + let summary = "Descriptors used in tensor store/load operations."; + let description = [{ + This type is opaque and corresponds to the two or four descriptor groups + used in tensor_load_to_lds or tensor_store_from_lds. + }]; } @@ -1233,6 +1237,10 @@ def AMDGPU_MakeDmaBaseOp : Variadic:$dstIndices)>, Results<(outs AMDGPU_TDMBaseType: $base)> { + // TODO: + // * Add verifiers such that one of the memrefs is from LDS and the other global. + // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions. 
+ let summary = "Pair of based addresses used when moving tiles between LDS and global memory."; let description = [{ This operation creates a pair of addresses that will be used by tensor_load_to_lds @@ -1247,4 +1255,22 @@ def AMDGPU_MakeDmaBaseOp : }]; } +def AMDGPU_MakeDmaDescriptorOp : + AMDGPU_Op<"make_dma_descriptor", [Pure]>, + Arguments<(ins + AMDGPU_TDMBaseType: $base, + Variadic: $dynamic_sizes, + OptionalAttr: $static_sizes)>, + Results<(outs AMDGPU_TDMDescriptorType: $desc)> { + + let summary = "TODO"; + let description = [{ + TODO + }]; + + let assemblyFormat = [{ + $base `globalSize` custom($dynamic_sizes, $static_sizes) attr-dict `:` qualified(type($base)) `to` type(results) + }]; +} + #endif // AMDGPU diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 653f9f64d24f4..818fd1afa2dc5 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -697,3 +697,11 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, func.return } +// CHECK-LABEL: func @make_dma_descriptor +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) { + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] globalSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + func.return +} + From f6f67e39b85c97c39445fa436462c2da916dec40 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 14:10:02 -0500 Subject: [PATCH 05/29] Add globalStride --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 11 ++++++++--- mlir/test/Dialect/AMDGPU/ops.mlir | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index e2fd78dab7ebf..b08039064adff 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ 
b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1256,11 +1256,13 @@ def AMDGPU_MakeDmaBaseOp : } def AMDGPU_MakeDmaDescriptorOp : - AMDGPU_Op<"make_dma_descriptor", [Pure]>, + AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>, Arguments<(ins AMDGPU_TDMBaseType: $base, Variadic: $dynamic_sizes, - OptionalAttr: $static_sizes)>, + OptionalAttr: $static_sizes, + Variadic: $dynamic_strides, + OptionalAttr: $static_strides)>, Results<(outs AMDGPU_TDMDescriptorType: $desc)> { let summary = "TODO"; @@ -1269,7 +1271,10 @@ def AMDGPU_MakeDmaDescriptorOp : }]; let assemblyFormat = [{ - $base `globalSize` custom($dynamic_sizes, $static_sizes) attr-dict `:` qualified(type($base)) `to` type(results) + $base + `globalSize` custom($dynamic_sizes, $static_sizes) + `globalStride` custom($dynamic_strides, $static_strides) + attr-dict `:` qualified(type($base)) `to` type(results) }]; } diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 818fd1afa2dc5..a36f59718f175 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -700,8 +700,8 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, // CHECK-LABEL: func @make_dma_descriptor // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) { - // CHECK: amdgpu.make_dma_descriptor %[[BASE]] globalSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor - amdgpu.make_dma_descriptor %base globalSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] globalSize [0] globalStride [1] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor func.return } From 1e2668c8c4dfcf5588c49bfeae1a65be2ae15a98 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 14:25:24 -0500 Subject: [PATCH 06/29] Add verifier for innermost dimension --- 
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 14 ++++++++------ mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 11 +++++++++++ mlir/test/Dialect/AMDGPU/invalid.mlir | 10 ++++++++++ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index b08039064adff..e0a356533144d 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1259,10 +1259,10 @@ def AMDGPU_MakeDmaDescriptorOp : AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>, Arguments<(ins AMDGPU_TDMBaseType: $base, - Variadic: $dynamic_sizes, - OptionalAttr: $static_sizes, - Variadic: $dynamic_strides, - OptionalAttr: $static_strides)>, + Variadic: $global_dynamic_sizes, + OptionalAttr: $global_static_sizes, + Variadic: $global_dynamic_strides, + OptionalAttr: $global_static_strides)>, Results<(outs AMDGPU_TDMDescriptorType: $desc)> { let summary = "TODO"; @@ -1272,10 +1272,12 @@ def AMDGPU_MakeDmaDescriptorOp : let assemblyFormat = [{ $base - `globalSize` custom($dynamic_sizes, $static_sizes) - `globalStride` custom($dynamic_strides, $static_strides) + `globalSize` custom($global_dynamic_sizes, $global_static_sizes) + `globalStride` custom($global_dynamic_strides, $global_static_strides) attr-dict `:` qualified(type($base)) `to` type(results) }]; + + let hasVerifier = 1; } #endif // AMDGPU diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index cdc10c60a42ae..4ade1164317af 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -705,6 +705,17 @@ LogicalResult TransposeLoadOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// MakeDmaDescriptorOp +//===----------------------------------------------------------------------===// + +LogicalResult 
MakeDmaDescriptorOp::verify() { + if (getGlobalStaticStrides()->back() != 1) { + return emitOpError("strides for the innermost dimension must be 1."); + } + return success(); +} + //===----------------------------------------------------------------------===// // ScaledMFMAOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 61fdf29a78cbd..f820060d2c718 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -354,3 +354,13 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x %0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32> func.return %0 : vector<16xf32> } + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_strides(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} + amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + func.return +} From f1df3c5b9722cae7000d2c9584345befd7827dc9 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 14:32:44 -0500 Subject: [PATCH 07/29] Add sharedSize --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 5 ++++- mlir/test/Dialect/AMDGPU/invalid.mlir | 2 +- mlir/test/Dialect/AMDGPU/ops.mlir | 10 ++++++++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index e0a356533144d..16ef34d1486cb 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1262,7 +1262,9 @@ def AMDGPU_MakeDmaDescriptorOp : 
Variadic: $global_dynamic_sizes, OptionalAttr: $global_static_sizes, Variadic: $global_dynamic_strides, - OptionalAttr: $global_static_strides)>, + OptionalAttr: $global_static_strides, + Variadic: $shared_dynamic_sizes, + OptionalAttr: $shared_static_sizes)>, Results<(outs AMDGPU_TDMDescriptorType: $desc)> { let summary = "TODO"; @@ -1274,6 +1276,7 @@ def AMDGPU_MakeDmaDescriptorOp : $base `globalSize` custom($global_dynamic_sizes, $global_static_sizes) `globalStride` custom($global_dynamic_strides, $global_static_strides) + `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) attr-dict `:` qualified(type($base)) `to` type(results) }]; diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index f820060d2c718..e8a0bfe9476a7 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -361,6 +361,6 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_strides(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} - amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor func.return } diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index a36f59718f175..0db84a187ddf5 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -700,8 +700,14 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, // CHECK-LABEL: func @make_dma_descriptor // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) { - // CHECK: amdgpu.make_dma_descriptor %[[BASE]] globalSize [0] globalStride [1] 
: !amdgpu.tdm_base to !amdgpu.tdm_descriptor - amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + // CHECK-SAME: globalSize [0] + // CHECK-SAME: globalStride [1] + // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base + globalSize [0] + globalStride [1] + sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor func.return } From a24a840b4eb2a2d2daefc91d8b32738fc48cb9d4 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 15:11:16 -0500 Subject: [PATCH 08/29] Add optional atomic barrier --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 16ef34d1486cb..d73e35ce82806 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1264,7 +1264,10 @@ def AMDGPU_MakeDmaDescriptorOp : Variadic: $global_dynamic_strides, OptionalAttr: $global_static_strides, Variadic: $shared_dynamic_sizes, - OptionalAttr: $shared_static_sizes)>, + OptionalAttr: $shared_static_sizes, + Optional: $atomic_barrier_address, + Variadic: $atomic_barrier_dynamic_indices, + OptionalAttr: $atomic_barrier_static_indices)>, Results<(outs AMDGPU_TDMDescriptorType: $desc)> { let summary = "TODO"; @@ -1274,10 +1277,13 @@ def AMDGPU_MakeDmaDescriptorOp : let assemblyFormat = [{ $base - `globalSize` custom($global_dynamic_sizes, $global_static_sizes) - `globalStride` custom($global_dynamic_strides, $global_static_strides) - `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) - attr-dict `:` qualified(type($base)) `to` type(results) + `globalSize` custom($global_dynamic_sizes, $global_static_sizes) + `globalStride` custom($global_dynamic_strides, $global_static_strides) + `sharedSize` 
custom($shared_dynamic_sizes, $shared_static_sizes) + ( `atomicBarrier` `(` $atomic_barrier_address^ + custom($atomic_barrier_dynamic_indices, $atomic_barrier_static_indices) + `:` type($atomic_barrier_address) `)`)? + attr-dict `:` qualified(type($base)) `to` type(results) }]; let hasVerifier = 1; From ccaf771d1fa91476adc4454991621d2e1d31d412 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 15:32:17 -0500 Subject: [PATCH 09/29] Add iterate --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 6 +++- mlir/test/Dialect/AMDGPU/ops.mlir | 36 ++++++++++++++++--- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index d73e35ce82806..8c04e45a1983e 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1267,7 +1267,10 @@ def AMDGPU_MakeDmaDescriptorOp : OptionalAttr: $shared_static_sizes, Optional: $atomic_barrier_address, Variadic: $atomic_barrier_dynamic_indices, - OptionalAttr: $atomic_barrier_static_indices)>, + OptionalAttr: $atomic_barrier_static_indices, + Optional: $global_increment, + Optional: $lds_increment, + Optional: $iteration_count)>, Results<(outs AMDGPU_TDMDescriptorType: $desc)> { let summary = "TODO"; @@ -1283,6 +1286,7 @@ def AMDGPU_MakeDmaDescriptorOp : ( `atomicBarrier` `(` $atomic_barrier_address^ custom($atomic_barrier_dynamic_indices, $atomic_barrier_static_indices) `:` type($atomic_barrier_address) `)`)? + ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )? 
attr-dict `:` qualified(type($base)) `to` type(results) }]; diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 0db84a187ddf5..6df7c300e5bc7 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -698,16 +698,42 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, } // CHECK-LABEL: func @make_dma_descriptor -// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) -func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) { +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[BARRIER:.+]]: memref<8xi32>, %[[IDX:.+]]: index) +func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8xi32>, %idx: index) { // CHECK: amdgpu.make_dma_descriptor %[[BASE]] - // CHECK-SAME: globalSize [0] - // CHECK-SAME: globalStride [1] - // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] globalSize [0] + // CHECK-SAME: globalStride [1] globalStride [1] + // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: atomicBarrier(%[[BARRIER]] [0] : memref<8xi32>) + atomicBarrier(%barrier [0] : memref<8xi32>) + : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + iterate %idx, %idx, %idx + // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]] + : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + + func.return } From 566d2e61a7f372e2308de60e6c9a224fcd309954 
Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 16:13:51 -0500 Subject: [PATCH 10/29] [mlir][amdgpu] Add make_dma_descriptor. --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 5 ++++ mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 22 ++++++++++++++++ mlir/test/Dialect/AMDGPU/ops.mlir | 25 +++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 8c04e45a1983e..d33605220c442 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1265,6 +1265,10 @@ def AMDGPU_MakeDmaDescriptorOp : OptionalAttr: $global_static_strides, Variadic: $shared_dynamic_sizes, OptionalAttr: $shared_static_sizes, + Optional: $pad, + OptionalAttr: $pad_const, + Optional: $every, + OptionalAttr: $every_const, Optional: $atomic_barrier_address, Variadic: $atomic_barrier_dynamic_indices, OptionalAttr: $atomic_barrier_static_indices, @@ -1283,6 +1287,7 @@ def AMDGPU_MakeDmaDescriptorOp : `globalSize` custom($global_dynamic_sizes, $global_static_sizes) `globalStride` custom($global_dynamic_strides, $global_static_strides) `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) + ( `padShared` `(` custom($pad, $pad_const)^ `every` custom($every, $every_const) `)` )? ( `atomicBarrier` `(` $atomic_barrier_address^ custom($atomic_barrier_dynamic_indices, $atomic_barrier_static_indices) `:` type($atomic_barrier_address) `)`)? 
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 4ade1164317af..b382fec21f20a 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -50,6 +50,28 @@ struct AMDGPUInlinerInterface final : DialectInlinerInterface { }; } // namespace +static ParseResult +parseDynamicIndex(OpAsmParser &parser, + std::optional dynamicSize, + IntegerAttr &staticSize) { + int64_t staticVal; + if (parser.parseOptionalInteger(staticVal).has_value()) { + staticSize = parser.getBuilder().getIndexAttr(staticVal); + return success(); + } + + return parser.parseOperand(dynamicSize.value()); +} + +static void printDynamicIndex(OpAsmPrinter &printer, Operation *op, + Value dynamicSize, IntegerAttr staticSize) { + if (staticSize) { + printer << staticSize.getValue(); + } else { + printer << dynamicSize; + } +} + void AMDGPUDialect::initialize() { addOperations< #define GET_OP_LIST diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 6df7c300e5bc7..36a4f1644c28a 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -700,6 +700,7 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, // CHECK-LABEL: func @make_dma_descriptor // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[BARRIER:.+]]: memref<8xi32>, %[[IDX:.+]]: index) func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8xi32>, %idx: index) { + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base // CHECK-SAME: globalSize [0] @@ -709,6 +710,30 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] 
+ // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: padShared(1 every 1) + padShared(1 every 1) + : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: padShared(1 every 1) + padShared(%idx every %idx) + : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base // CHECK-SAME: globalSize [0] From 2be4ccccbcd1971da13173da3087ba8b8c56208e Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 16:21:48 -0500 Subject: [PATCH 11/29] Fix indentation --- mlir/test/Dialect/AMDGPU/ops.mlir | 46 +++++++++++++++---------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 36a4f1644c28a..0bc13e4256244 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -704,59 +704,59 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base // CHECK-SAME: globalSize [0] - globalSize [0] + globalSize [0] // CHECK-SAME: globalStride [1] - globalStride [1] + globalStride [1] // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor - sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base // CHECK-SAME: globalSize [0] - globalSize [0] + globalSize [0] // CHECK-SAME: globalStride [1] - globalStride [1] + globalStride [1] // CHECK-SAME: sharedSize [0] - sharedSize [0] + sharedSize [0] // CHECK-SAME: padShared(1 every 1) - 
padShared(1 every 1) - : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + padShared(1 every 1) + : !amdgpu.tdm_base to !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base // CHECK-SAME: globalSize [0] - globalSize [0] + globalSize [0] // CHECK-SAME: globalStride [1] - globalStride [1] + globalStride [1] // CHECK-SAME: sharedSize [0] - sharedSize [0] + sharedSize [0] // CHECK-SAME: padShared(1 every 1) - padShared(%idx every %idx) - : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + padShared(%idx every %idx) + : !amdgpu.tdm_base to !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base // CHECK-SAME: globalSize [0] - globalSize [0] + globalSize [0] // CHECK-SAME: globalStride [1] - globalStride [1] + globalStride [1] // CHECK-SAME: sharedSize [0] - sharedSize [0] + sharedSize [0] // CHECK-SAME: atomicBarrier(%[[BARRIER]] [0] : memref<8xi32>) - atomicBarrier(%barrier [0] : memref<8xi32>) - : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + atomicBarrier(%barrier [0] : memref<8xi32>) + : !amdgpu.tdm_base to !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base // CHECK-SAME: globalSize [0] - globalSize [0] + globalSize [0] // CHECK-SAME: globalStride [1] - globalStride [1] + globalStride [1] // CHECK-SAME: sharedSize [0] - sharedSize [0] - iterate %idx, %idx, %idx + sharedSize [0] // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]] - : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + iterate %idx, %idx, %idx + : !amdgpu.tdm_base to !amdgpu.tdm_descriptor func.return From b3ba450d336c451b33685f67ddcd42e4a500d80c Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 17:10:42 -0500 Subject: [PATCH 12/29] Review --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 24 +++++++++++++++---- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 11 +++++---- mlir/test/Dialect/AMDGPU/invalid.mlir | 2 +- mlir/test/Dialect/AMDGPU/ops.mlir | 20 
++++++++-------- 4 files changed, 37 insertions(+), 20 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index d33605220c442..981698a8d25e6 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1251,7 +1251,7 @@ def AMDGPU_MakeDmaBaseOp : }]; let assemblyFormat = [{ - $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results) + $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results) }]; } @@ -1277,9 +1277,25 @@ def AMDGPU_MakeDmaDescriptorOp : Optional: $iteration_count)>, Results<(outs AMDGPU_TDMDescriptorType: $desc)> { - let summary = "TODO"; + let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS."; let description = [{ - TODO + Make all descriptor groups needed by tensor memory operations. + + The $base operand corresponds to the base pair addresses, one must be an address in LDS + while the other must be a global memory location. + + $global_{static/dynamic}_sizes determine the size of the tensor. + $global_{static/dynamic}_strides determine the strides of the tensor. + $shared_{static/dynamic}_sizes determines the size of the tile. + + Padding can be applied to the LDS address when copying from memory to LDS, + but not when copying from LDS to memory. + The values in the padded target addresses remain the same as before the operation was applied. + + 2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count. + $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type. + $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type. + $iterate_count determines how many times to iterate. 
}]; let assemblyFormat = [{ @@ -1292,7 +1308,7 @@ def AMDGPU_MakeDmaDescriptorOp : custom($atomic_barrier_dynamic_indices, $atomic_barrier_static_indices) `:` type($atomic_barrier_address) `)`)? ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )? - attr-dict `:` qualified(type($base)) `to` type(results) + attr-dict `:` qualified(type($base)) `->` type(results) }]; let hasVerifier = 1; diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index b382fec21f20a..6863dc4ad3e7f 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -54,12 +54,13 @@ static ParseResult parseDynamicIndex(OpAsmParser &parser, std::optional dynamicSize, IntegerAttr &staticSize) { - int64_t staticVal; + + int64_t staticVal = 0; if (parser.parseOptionalInteger(staticVal).has_value()) { staticSize = parser.getBuilder().getIndexAttr(staticVal); return success(); } - + return parser.parseOperand(dynamicSize.value()); } @@ -67,9 +68,9 @@ static void printDynamicIndex(OpAsmPrinter &printer, Operation *op, Value dynamicSize, IntegerAttr staticSize) { if (staticSize) { printer << staticSize.getValue(); - } else { - printer << dynamicSize; - } + return; + } + printer << dynamicSize; } void AMDGPUDialect::initialize() { diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index e8a0bfe9476a7..a72193d532ab9 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -361,6 +361,6 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_strides(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} - amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base to 
!amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 0bc13e4256244..2984bedac7bf5 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -689,11 +689,11 @@ func.func @memory_counter_wait() { // CHECK-LABEL: func @make_dma_base // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space>) func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space>) { - // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base - amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base - // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base - amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base + amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base func.return } @@ -707,8 +707,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x globalSize [0] // CHECK-SAME: globalStride [1] globalStride [1] - // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base to !amdgpu.tdm_descriptor - sharedSize [0] : 
!amdgpu.tdm_base to !amdgpu.tdm_descriptor + // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base @@ -720,7 +720,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x sharedSize [0] // CHECK-SAME: padShared(1 every 1) padShared(1 every 1) - : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base @@ -732,7 +732,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x sharedSize [0] // CHECK-SAME: padShared(1 every 1) padShared(%idx every %idx) - : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base @@ -744,7 +744,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x sharedSize [0] // CHECK-SAME: atomicBarrier(%[[BARRIER]] [0] : memref<8xi32>) atomicBarrier(%barrier [0] : memref<8xi32>) - : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base @@ -756,7 +756,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x sharedSize [0] // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]] iterate %idx, %idx, %idx - : !amdgpu.tdm_base to !amdgpu.tdm_descriptor + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return From d34c423efcaa0a655ed599e04d9ec56177270dcb Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 17:53:03 -0500 Subject: [PATCH 13/29] Fix parser --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 16 +++++++++++----- mlir/test/Dialect/AMDGPU/ops.mlir | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git 
a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 6863dc4ad3e7f..f37ba43fcaa39 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -52,16 +52,22 @@ struct AMDGPUInlinerInterface final : DialectInlinerInterface { static ParseResult parseDynamicIndex(OpAsmParser &parser, - std::optional dynamicSize, + std::optional &dynamicSize, IntegerAttr &staticSize) { - int64_t staticVal = 0; - if (parser.parseOptionalInteger(staticVal).has_value()) { + int64_t staticVal; + OptionalParseResult parseResult = parser.parseOptionalInteger(staticVal); + if (parseResult.has_value()) { staticSize = parser.getBuilder().getIndexAttr(staticVal); return success(); } - - return parser.parseOperand(dynamicSize.value()); + + OpAsmParser::UnresolvedOperand operand = OpAsmParser::UnresolvedOperand{}; + if (parser.parseOperand(operand)) { + dynamicSize = operand; + return success(); + } + return failure(); } static void printDynamicIndex(OpAsmPrinter &printer, Operation *op, diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 2984bedac7bf5..923b30ce95363 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -730,7 +730,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x globalStride [1] // CHECK-SAME: sharedSize [0] sharedSize [0] - // CHECK-SAME: padShared(1 every 1) + // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]]) padShared(%idx every %idx) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor From cfb20cce8a1bec1bdb2bd72ac96920356d3e679d Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 17:56:36 -0500 Subject: [PATCH 14/29] whitespace --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index f37ba43fcaa39..2e9b198cb93f4 
100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -75,7 +75,7 @@ static void printDynamicIndex(OpAsmPrinter &printer, Operation *op, if (staticSize) { printer << staticSize.getValue(); return; - } + } printer << dynamicSize; } From 5e98ed07eaac6271fd78302c22664286448d56b9 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 24 Nov 2025 18:26:48 -0500 Subject: [PATCH 15/29] Fix parser --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 2e9b198cb93f4..765c4b4907952 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -63,11 +63,18 @@ parseDynamicIndex(OpAsmParser &parser, } OpAsmParser::UnresolvedOperand operand = OpAsmParser::UnresolvedOperand{}; - if (parser.parseOperand(operand)) { - dynamicSize = operand; + OptionalParseResult hasOperand = parser.parseOptionalOperand(operand); + if (!hasOperand.has_value()) { + dynamicSize = std::nullopt; return success(); } - return failure(); + + if (failed(hasOperand.value())) { + return failure(); + } + + dynamicSize = operand; + return success(); } static void printDynamicIndex(OpAsmPrinter &printer, Operation *op, From 5cca5f979201fd5838459feeec2487ebc9f6ea64 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 25 Nov 2025 09:34:54 -0500 Subject: [PATCH 16/29] check if it is not empty --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 3 +++ mlir/test/Dialect/AMDGPU/invalid.mlir | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 765c4b4907952..cf6a136f34547 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -746,6 +746,9 @@ LogicalResult 
TransposeLoadOp::verify() { //===----------------------------------------------------------------------===// LogicalResult MakeDmaDescriptorOp::verify() { + if (getGlobalStaticStrides()->size() == 0) { + return emitOpError("strides must not be empty."); + } if (getGlobalStaticStrides()->back() != 1) { return emitOpError("strides for the innermost dimension must be 1."); } diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index a72193d532ab9..e5d4bfb152997 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -357,9 +357,19 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x // ----- -// CHECK-LABEL: func @make_dma_descriptor +// CHECK-LABEL: func @make_dma_descriptor_invalid_empty_strides // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) -func.func @make_dma_descriptor_invalid_strides(%base: !amdgpu.tdm_base) { +func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}} + amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_innermost_stride +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return From 0f913f59149287c527747e825dce3f7f99cd5b97 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 25 Nov 2025 09:59:47 -0500 Subject: [PATCH 17/29] less variables --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index cf6a136f34547..6e31efb897a47 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -55,9 +55,7 @@ parseDynamicIndex(OpAsmParser &parser, std::optional &dynamicSize, IntegerAttr &staticSize) { - int64_t staticVal; - OptionalParseResult parseResult = parser.parseOptionalInteger(staticVal); - if (parseResult.has_value()) { + if (int64_t staticVal; parser.parseOptionalInteger(staticVal).has_value()) { staticSize = parser.getBuilder().getIndexAttr(staticVal); return success(); } From adcbc32cf07d89093e1d1c8a0c77a27214108c77 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 25 Nov 2025 09:59:57 -0500 Subject: [PATCH 18/29] mlir example --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 981698a8d25e6..d9de161c89051 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1248,6 +1248,31 @@ def AMDGPU_MakeDmaBaseOp : This operation creates a value corresponding roughly to the descriptor group 0 found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect. 
+ For example: + + ```mlir + %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + ``` + + to + + ```mlir + // pseudocode + %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)> + %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)> + %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct<(ptr, ptr)> + // type(%base_2) = !llvm.struct<(ptr, ptr)> roughly corresponds to amdgpu.tdm_base + + // The base will be used when constructing dgroup0 + // when lowering amdgpu.make_dma_descriptor + %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)> + %dgroup0_1 = llvm.insertvalue %base_2, %dgroup0_0 : .... + + // When lowering amdgpu.tensor_load_to_lds + rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + ``` }]; let assemblyFormat = [{ From 61fd94d85bfab34b038dc5f721e6c7cfa52d79c0 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 25 Nov 2025 10:14:16 -0500 Subject: [PATCH 19/29] MLIR examples --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index d9de161c89051..0d8aea35a6444 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1321,6 +1321,18 @@ def AMDGPU_MakeDmaDescriptorOp : $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type. $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
$iterate_count determines how many times to iterate. + + ```mlir + // Example of moving a two-dimensional tensor to LDS. + %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + + // Example of moving a two dimension tensor to LDS where padding is applied after every integer. + %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(1 every 1) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + ``` }]; let assemblyFormat = [{ From 3de0f3c74911f53987badc071f75900b52b25f64 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 25 Nov 2025 10:29:00 -0500 Subject: [PATCH 20/29] Use custom for indices. 
--- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 10 +++++++--- mlir/test/Dialect/AMDGPU/ops.mlir | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 0d8aea35a6444..e460f049a92eb 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1232,9 +1232,11 @@ def AMDGPU_MakeDmaBaseOp : AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>, Arguments<(ins Arg:$src, - Variadic:$srcIndices, + Variadic:$src_indices, + OptionalAttr: $src_indices_const, Arg:$dst, - Variadic:$dstIndices)>, + Variadic:$dst_indices, + OptionalAttr: $dst_indices_const)>, Results<(outs AMDGPU_TDMBaseType: $base)> { // TODO: @@ -1275,8 +1277,10 @@ def AMDGPU_MakeDmaBaseOp : ``` }]; + // TODO: Define a custom printer, parser to avoid space between $src/%dst and indices. let assemblyFormat = [{ - $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results) + $src custom($src_indices, $src_indices_const) `,` + $dst custom($dst_indices, $dst_indices_const) attr-dict `:` type($src) `,` type($dst) `->` type(results) }]; } diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 923b30ce95363..4b185b16b093e 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -689,10 +689,10 @@ func.func @memory_counter_wait() { // CHECK-LABEL: func @make_dma_base // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space>) func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space>) { - // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[MEM]] [%[[IDX]]], %[[SMEM]] [%[[IDX]]] : memref<8xi32>, memref<8xi32, 
#gpu.address_space> -> !amdgpu.tdm_base amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base - // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[SMEM]] [%[[IDX]]], %[[MEM]] [%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base func.return } From 0a70e24afe50184a6f2016afb670a3ce33c85f4a Mon Sep 17 00:00:00 2001 From: Erick Ochoa Lopez Date: Tue, 25 Nov 2025 10:41:34 -0500 Subject: [PATCH 21/29] Update mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp Co-authored-by: Jakub Kuderski --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 6e31efb897a47..9a0e5ac83e72c 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -744,7 +744,7 @@ LogicalResult TransposeLoadOp::verify() { //===----------------------------------------------------------------------===// LogicalResult MakeDmaDescriptorOp::verify() { - if (getGlobalStaticStrides()->size() == 0) { + if (getGlobalStaticStrides()->empty()) { return emitOpError("strides must not be empty."); } if (getGlobalStaticStrides()->back() != 1) { From 29072b82c8779e71307382b1717b06bdb6211b5f Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 15:16:03 -0500 Subject: [PATCH 22/29] Remove OptionalAttr from static indices --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 6 +++--- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td 
b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index d895a1fca00c9..98620f81ba040 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1291,11 +1291,11 @@ def AMDGPU_MakeDmaDescriptorOp : Arguments<(ins AMDGPU_TDMBaseType: $base, Variadic: $global_dynamic_sizes, - OptionalAttr: $global_static_sizes, + DenseI64ArrayAttr: $global_static_sizes, Variadic: $global_dynamic_strides, - OptionalAttr: $global_static_strides, + DenseI64ArrayAttr: $global_static_strides, Variadic: $shared_dynamic_sizes, - OptionalAttr: $shared_static_sizes, + DenseI64ArrayAttr: $shared_static_sizes, Optional: $pad, OptionalAttr: $pad_const, Optional: $every, diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 9a0e5ac83e72c..bbee889293f84 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -744,10 +744,10 @@ LogicalResult TransposeLoadOp::verify() { //===----------------------------------------------------------------------===// LogicalResult MakeDmaDescriptorOp::verify() { - if (getGlobalStaticStrides()->empty()) { + if (getGlobalStaticStrides().empty()) { return emitOpError("strides must not be empty."); } - if (getGlobalStaticStrides()->back() != 1) { + if (getGlobalStaticStrides().back() != 1) { return emitOpError("strides for the innermost dimension must be 1."); } return success(); From 50e76d4f02ae1b5b08e0530b00e0096f31dfc7db Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 15:20:58 -0500 Subject: [PATCH 23/29] Atomic barrier only takes Variadic --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 6 ++---- mlir/test/Dialect/AMDGPU/ops.mlir | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 98620f81ba040..7add6d971109c 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td 
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1301,8 +1301,7 @@ def AMDGPU_MakeDmaDescriptorOp : Optional: $every, OptionalAttr: $every_const, Optional: $atomic_barrier_address, - Variadic: $atomic_barrier_dynamic_indices, - OptionalAttr: $atomic_barrier_static_indices, + Variadic: $atomic_barrier_indices, Optional: $global_increment, Optional: $lds_increment, Optional: $iteration_count)>, @@ -1347,8 +1346,7 @@ def AMDGPU_MakeDmaDescriptorOp : `globalStride` custom($global_dynamic_strides, $global_static_strides) `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) ( `padShared` `(` custom($pad, $pad_const)^ `every` custom($every, $every_const) `)` )? - ( `atomicBarrier` `(` $atomic_barrier_address^ - custom($atomic_barrier_dynamic_indices, $atomic_barrier_static_indices) + ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]` `:` type($atomic_barrier_address) `)`)? ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )? attr-dict `:` qualified(type($base)) `->` type(results) diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 617c375ca0c15..f2c2c2b43a4e0 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -742,8 +742,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x globalStride [1] // CHECK-SAME: sharedSize [0] sharedSize [0] - // CHECK-SAME: atomicBarrier(%[[BARRIER]] [0] : memref<8xi32>) - atomicBarrier(%barrier [0] : memref<8xi32>) + // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32>) + atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] From b339c7af115874c10677a52d3b00a38dbb58c29b Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 15:25:58 -0500 Subject: [PATCH 24/29] Do not use attribute splitting --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 +--- 
mlir/test/Dialect/AMDGPU/ops.mlir | 12 ------------ 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 7add6d971109c..754f6c008d461 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1297,9 +1297,7 @@ def AMDGPU_MakeDmaDescriptorOp : Variadic: $shared_dynamic_sizes, DenseI64ArrayAttr: $shared_static_sizes, Optional: $pad, - OptionalAttr: $pad_const, Optional: $every, - OptionalAttr: $every_const, Optional: $atomic_barrier_address, Variadic: $atomic_barrier_indices, Optional: $global_increment, @@ -1345,7 +1343,7 @@ def AMDGPU_MakeDmaDescriptorOp : `globalSize` custom($global_dynamic_sizes, $global_static_sizes) `globalStride` custom($global_dynamic_strides, $global_static_strides) `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) - ( `padShared` `(` custom($pad, $pad_const)^ `every` custom($every, $every_const) `)` )? + ( `padShared` `(` $pad^ `every` $every `)` )? ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]` `:` type($atomic_barrier_address) `)`)? ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )? 
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index f2c2c2b43a4e0..8983fb96729f9 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -710,18 +710,6 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor - // CHECK: amdgpu.make_dma_descriptor %[[BASE]] - amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] - sharedSize [0] - // CHECK-SAME: padShared(1 every 1) - padShared(1 every 1) - : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor - // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base // CHECK-SAME: globalSize [0] From fb82ac38e14718541003b90b86b8cf35e623bcac Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 15:29:06 -0500 Subject: [PATCH 25/29] Rename $every to $pad_every. --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 754f6c008d461..17d9ee37b8d51 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1297,7 +1297,7 @@ def AMDGPU_MakeDmaDescriptorOp : Variadic: $shared_dynamic_sizes, DenseI64ArrayAttr: $shared_static_sizes, Optional: $pad, - Optional: $every, + Optional: $pad_every, Optional: $atomic_barrier_address, Variadic: $atomic_barrier_indices, Optional: $global_increment, @@ -1333,7 +1333,7 @@ def AMDGPU_MakeDmaDescriptorOp : // Example of moving a two dimension tensor to LDS where padding is applied after every integer. 
%base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base - %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(1 every 1) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad every %pad_every) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor ``` }]; @@ -1343,7 +1343,7 @@ def AMDGPU_MakeDmaDescriptorOp : `globalSize` custom($global_dynamic_sizes, $global_static_sizes) `globalStride` custom($global_dynamic_strides, $global_static_strides) `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) - ( `padShared` `(` $pad^ `every` $every `)` )? + ( `padShared` `(` $pad^ `every` $pad_every `)` )? ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]` `:` type($atomic_barrier_address) `)`)? ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
From 445f96ee820de3fbb6f5e1439c774308d28177c3 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 15:31:03 -0500 Subject: [PATCH 26/29] Remove unused functions --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 34 -------------------- 1 file changed, 34 deletions(-) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index bbee889293f84..e94aac3686896 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -50,40 +50,6 @@ struct AMDGPUInlinerInterface final : DialectInlinerInterface { }; } // namespace -static ParseResult -parseDynamicIndex(OpAsmParser &parser, - std::optional &dynamicSize, - IntegerAttr &staticSize) { - - if (int64_t staticVal; parser.parseOptionalInteger(staticVal).has_value()) { - staticSize = parser.getBuilder().getIndexAttr(staticVal); - return success(); - } - - OpAsmParser::UnresolvedOperand operand = OpAsmParser::UnresolvedOperand{}; - OptionalParseResult hasOperand = parser.parseOptionalOperand(operand); - if (!hasOperand.has_value()) { - dynamicSize = std::nullopt; - return success(); - } - - if (failed(hasOperand.value())) { - return failure(); - } - - dynamicSize = operand; - return success(); -} - -static void printDynamicIndex(OpAsmPrinter &printer, Operation *op, - Value dynamicSize, IntegerAttr staticSize) { - if (staticSize) { - printer << staticSize.getValue(); - return; - } - printer << dynamicSize; -} - void AMDGPUDialect::initialize() { addOperations< #define GET_OP_LIST From e022322bc903843346ce9c228f44f360eff800da Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 15:39:44 -0500 Subject: [PATCH 27/29] Only use dynamic indices in make_dma_base --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 8 ++------ mlir/test/Dialect/AMDGPU/ops.mlir | 4 ++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td 
b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 17d9ee37b8d51..3581b07dc4e3e 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1232,10 +1232,8 @@ def AMDGPU_MakeDmaBaseOp : Arguments<(ins Arg:$src, Variadic:$src_indices, - OptionalAttr: $src_indices_const, Arg:$dst, - Variadic:$dst_indices, - OptionalAttr: $dst_indices_const)>, + Variadic:$dst_indices)>, Results<(outs AMDGPU_TDMBaseType: $base)> { // TODO: @@ -1279,10 +1277,8 @@ def AMDGPU_MakeDmaBaseOp : These tensor DMA operations were introduced in gfx1250. }]; - // TODO: Define a custom printer, parser to avoid space between $src/%dst and indices. let assemblyFormat = [{ - $src custom($src_indices, $src_indices_const) `,` - $dst custom($dst_indices, $dst_indices_const) attr-dict `:` type($src) `,` type($dst) `->` type(results) + $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results) }]; } diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 8983fb96729f9..a8af06dc5ff0a 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -689,10 +689,10 @@ func.func @memory_counter_wait() { // CHECK-LABEL: func @make_dma_base // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space>) func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space>) { - // CHECK: amdgpu.make_dma_base %[[MEM]] [%[[IDX]]], %[[SMEM]] [%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base - // CHECK: amdgpu.make_dma_base %[[SMEM]] [%[[IDX]]], %[[MEM]] [%[[IDX]]] : memref<8xi32, 
#gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base func.return } From 850d6d0f3b7e460c5566afd679eebf21544a5902 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 15:49:16 -0500 Subject: [PATCH 28/29] Add verification for same rank --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 12 ++++++++++-- mlir/test/Dialect/AMDGPU/invalid.mlir | 13 ++++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index e94aac3686896..f0d9e1a8020be 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -710,12 +710,20 @@ LogicalResult TransposeLoadOp::verify() { //===----------------------------------------------------------------------===// LogicalResult MakeDmaDescriptorOp::verify() { - if (getGlobalStaticStrides().empty()) { + ArrayRef globalStaticStrides = getGlobalStaticStrides(); + + if (globalStaticStrides.empty()) { return emitOpError("strides must not be empty."); } - if (getGlobalStaticStrides().back() != 1) { + if (globalStaticStrides.back() != 1) { return emitOpError("strides for the innermost dimension must be 1."); } + + ArrayRef globalStaticSizes = getGlobalStaticSizes(); + if (globalStaticSizes.size() != globalStaticStrides.size()) { + return emitOpError("strides and sizes must have same rank."); + } + return success(); } diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index e5d4bfb152997..193e0c22adae6 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -371,6 +371,17 @@ func.func @make_dma_descriptor_invalid_empty_strides(%base: 
!amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} - amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_size_and_stride_sizes +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}} + amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + From a8fbe1abf8690392ee5e774a2118f6404bdadec5 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 15:59:04 -0500 Subject: [PATCH 29/29] Verify tile and tensor's rank are the same --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 8 +++++++- mlir/test/Dialect/AMDGPU/invalid.mlir | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index f0d9e1a8020be..5ff640b5d1596 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -720,10 +720,16 @@ LogicalResult MakeDmaDescriptorOp::verify() { } ArrayRef globalStaticSizes = getGlobalStaticSizes(); - if (globalStaticSizes.size() != globalStaticStrides.size()) { + size_t rank = globalStaticSizes.size(); + if (rank != globalStaticStrides.size()) { return emitOpError("strides and sizes must have same rank."); } + ArrayRef sharedStaticSizes = getSharedStaticSizes(); + if (rank != sharedStaticSizes.size()) { + 
return emitOpError("tensor must have same rank as tile."); + } + return success(); } diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 193e0c22adae6..066f46060f62f 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -385,3 +385,12 @@ func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_ func.return } +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_shared_and_global_rank +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}} + amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +}