-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[mlir][rocdl] Add GlobalLoadAsyncToLDS operation #165374
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[mlir][rocdl] Add GlobalLoadAsyncToLDS operation #165374
Conversation
|
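@llvm/pr-subscribers-mlir Author: Pablo Antonio Martinez (pabloantoniom) Changes: Adds `global.load.async.to.lds` op to rocdl, supporting `b8`, `b32`, `b64` and `b128`. The op is lowered to the appropriate `llvm.amdgcn.global.load.async.to.lds.bXX` intrinsic. This is available on gfx1250+. Full diff: https://github.com/llvm/llvm-project/pull/165374.diff 3 Files Affected: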
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index d2df244eb9363..3fcbbe52748f5 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -663,6 +663,39 @@ def ROCDL_GlobalLoadLDSOp :
}];
}
+//===---------------------------------------------------------------------===//
+// Async load to LDS intrinsic (available in GFX1250)
+//===---------------------------------------------------------------------===//
+
+class ROCDL_GlobalLoadAsyncToLDSOp<string mnemonic> :
+ ROCDL_IntrOp<mnemonic, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> {
+ dag args = (ins Arg<LLVM_AnyPointer, "", [MemRead]>:$globalPtr,
+ Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+ I32Attr:$offset,
+ I32Attr:$aux);
+ let arguments = !con(args, baseArgs);
+ let assemblyFormat = [{
+ $globalPtr `,` $ldsPtr `,` $offset `,` $aux
+ attr-dict `:` type($globalPtr)
+ }];
+ let description = [{
+ Loads data asynchronously from a global memory pointer to a local data
+ store (LDS) pointer.
+
+ Available on gfx1250+.
+ }];
+ let extraClassDefinition = [{
+ ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+ return {getGlobalPtr(), getLdsPtr()};
+ }
+ }];
+}
+
+def ROCDL_GlobalLoadAsyncToLDSB8Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b8">;
+def ROCDL_GlobalLoadAsyncToLDSB32Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b32">;
+def ROCDL_GlobalLoadAsyncToLDSB64Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b64">;
+def ROCDL_GlobalLoadAsyncToLDSB128Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b128">;
+
//===---------------------------------------------------------------------===//
// Operations on raw buffer resources (stride of 0, bounds checks either off or in
// raw buffer mode).
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index d270ee8b089aa..47464abd610f9 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -664,6 +664,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
llvm.return
}
+llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK-LABEL @rocdl.global.load.async.to.lds
+ // CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0
+ // CHECK: rocdl.global.load.async.to.lds.b32 %{{.*}}, %{{.*}}, 0, 0
+ // CHECK: rocdl.global.load.async.to.lds.b64 %{{.*}}, %{{.*}}, 0, 0
+ // CHECK: rocdl.global.load.async.to.lds.b128 %{{.*}}, %{{.*}}, 0, 0
+ rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : <1>
+ rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : <1>
+ rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : <1>
+ rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : <1>
+ llvm.return
+}
+
llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
%stride : i16,
%numRecords : i64,
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 30126f6bff05a..5ae9f11360df4 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1040,6 +1040,30 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
llvm.return
}
+llvm.func @rocdl.global.load.async.lds.b8(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8
+ rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : !llvm.ptr<1>
+ llvm.return
+}
+
+llvm.func @rocdl.global.load.async.lds.b32(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b32
+ rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : !llvm.ptr<1>
+ llvm.return
+}
+
+llvm.func @rocdl.global.load.async.lds.b64(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b64
+ rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : !llvm.ptr<1>
+ llvm.return
+}
+
+llvm.func @rocdl.global.load.async.lds.b128(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b128
+ rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : !llvm.ptr<1>
+ llvm.return
+}
+
llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
%stride : i16,
%numRecords : i64,
|
|
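@llvm/pr-subscribers-mlir-llvm Author: Pablo Antonio Martinez (pabloantoniom) Changes: Adds `global.load.async.to.lds` op to rocdl, supporting `b8`, `b32`, `b64` and `b128`. The op is lowered to the appropriate `llvm.amdgcn.global.load.async.to.lds.bXX` intrinsic. This is available on gfx1250+. Full diff: https://github.com/llvm/llvm-project/pull/165374.diff 3 Files Affected: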
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index d2df244eb9363..3fcbbe52748f5 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -663,6 +663,39 @@ def ROCDL_GlobalLoadLDSOp :
}];
}
+//===---------------------------------------------------------------------===//
+// Async load to LDS intrinsic (available in GFX1250)
+//===---------------------------------------------------------------------===//
+
+class ROCDL_GlobalLoadAsyncToLDSOp<string mnemonic> :
+ ROCDL_IntrOp<mnemonic, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> {
+ dag args = (ins Arg<LLVM_AnyPointer, "", [MemRead]>:$globalPtr,
+ Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+ I32Attr:$offset,
+ I32Attr:$aux);
+ let arguments = !con(args, baseArgs);
+ let assemblyFormat = [{
+ $globalPtr `,` $ldsPtr `,` $offset `,` $aux
+ attr-dict `:` type($globalPtr)
+ }];
+ let description = [{
+ Loads data asynchronously from a global memory pointer to a local data
+ store (LDS) pointer.
+
+ Available on gfx1250+.
+ }];
+ let extraClassDefinition = [{
+ ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+ return {getGlobalPtr(), getLdsPtr()};
+ }
+ }];
+}
+
+def ROCDL_GlobalLoadAsyncToLDSB8Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b8">;
+def ROCDL_GlobalLoadAsyncToLDSB32Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b32">;
+def ROCDL_GlobalLoadAsyncToLDSB64Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b64">;
+def ROCDL_GlobalLoadAsyncToLDSB128Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b128">;
+
//===---------------------------------------------------------------------===//
// Operations on raw buffer resources (stride of 0, bounds checks either off or in
// raw buffer mode).
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index d270ee8b089aa..47464abd610f9 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -664,6 +664,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
llvm.return
}
+llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK-LABEL @rocdl.global.load.async.to.lds
+ // CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0
+ // CHECK: rocdl.global.load.async.to.lds.b32 %{{.*}}, %{{.*}}, 0, 0
+ // CHECK: rocdl.global.load.async.to.lds.b64 %{{.*}}, %{{.*}}, 0, 0
+ // CHECK: rocdl.global.load.async.to.lds.b128 %{{.*}}, %{{.*}}, 0, 0
+ rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : <1>
+ rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : <1>
+ rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : <1>
+ rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : <1>
+ llvm.return
+}
+
llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
%stride : i16,
%numRecords : i64,
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 30126f6bff05a..5ae9f11360df4 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1040,6 +1040,30 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
llvm.return
}
+llvm.func @rocdl.global.load.async.lds.b8(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8
+ rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : !llvm.ptr<1>
+ llvm.return
+}
+
+llvm.func @rocdl.global.load.async.lds.b32(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b32
+ rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : !llvm.ptr<1>
+ llvm.return
+}
+
+llvm.func @rocdl.global.load.async.lds.b64(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b64
+ rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : !llvm.ptr<1>
+ llvm.return
+}
+
+llvm.func @rocdl.global.load.async.lds.b128(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b128
+ rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : !llvm.ptr<1>
+ llvm.return
+}
+
llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
%stride : i16,
%numRecords : i64,
|
kuhar
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM but let's wait for @krzysz00 to confirm
lialan
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
Adds `global.load.async.to.lds` op to rocdl, supporting `b8`, `b32`, `b64` and `b128`. The op is lowered to the appropriate `llvm.amdgcn.global.load.async.to.lds.bXX` intrinsic. This is available on gfx1250+.
0a27dd6 to
fe4f87b
Compare
| class ROCDL_GlobalLoadAsyncToLDSOp<string mnemonic> : | ||
| ROCDL_IntrOp<mnemonic, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> { | ||
| dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr, | ||
| Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr, | ||
| I32Attr:$offset, | ||
| I32Attr:$aux); | ||
| let arguments = !con(args, baseArgs); | ||
| let assemblyFormat = [{ | ||
| $globalPtr `,` $ldsPtr `,` $offset `,` $aux | ||
| attr-dict `:` type($globalPtr) | ||
| }]; | ||
| let description = [{ | ||
| Loads data asynchronously from a global memory pointer to a local data | ||
| store (LDS) pointer. | ||
|
|
||
| Available on gfx1250+. | ||
| }]; | ||
| let extraClassDefinition = [{ | ||
| ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() { | ||
| return {getGlobalPtr(), getLdsPtr()}; | ||
| } | ||
| }]; | ||
| } | ||
|
|
||
| def ROCDL_GlobalLoadAsyncToLDSB8Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b8">; | ||
| def ROCDL_GlobalLoadAsyncToLDSB32Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b32">; | ||
| def ROCDL_GlobalLoadAsyncToLDSB64Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b64">; | ||
| def ROCDL_GlobalLoadAsyncToLDSB128Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b128">; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NIT: Maybe we can use the `foreach` construct in TableGen?
foreach bytes = [8, 32, 64, 128] in {
let bytesStr = "b" # !cast<string>(bytes) in
def ROCDL_GlobalLoadAsyncToLDS # !toupper(bytesStr) # Op :
ROCDL_IntrOp<"global.load.async.to.lds." # bytesStr, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> {
dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
I32Attr:$offset,
I32Attr:$aux);
let arguments = !con(args, baseArgs);
let assemblyFormat = [{
$globalPtr `,` $ldsPtr `,` $offset `,` $aux
attr-dict `:` type($globalPtr)
}];
let description = [{
Asynchronously loads # bytes # bytes of data from a global memory to a Local Data
Store (LDS).
Available on gfx1250+.
}];
let extraClassDefinition = [{
::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
return {getGlobalPtr(), getLdsPtr()};
}
}];
}
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wow, that's some cool tablegen black magic I didn't know about!
I have used `foreach` as you suggested, but I changed it slightly: the iterator should be bits, not bytes, and the description was not showing the right string, so I also fixed that.
I have checked the generated documentation and it correctly shows the 4 ops with the right description. What I don't like is that they are not generated in order (b128 comes first, then b32, then b64, then b8).
| }]; | ||
| } | ||
|
|
||
| def ROCDL_GlobalLoadAsyncToLDSB8Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b8">; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seconding Ravil's nits here.
|
Any further suggestions, or are you guys happy with the current state? @ravil-mobile @krzysz00 |
LGTM |
krzysz00
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor note re assembly format, otherwise approved
| let arguments = !con(args, baseArgs); | ||
| let assemblyFormat = [{ | ||
| $globalPtr `,` $ldsPtr `,` $offset `,` $aux | ||
| attr-dict `:` type($globalPtr) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd go for type($globalPtr), type($ldsPtr) if we're doing this sort of thing
Adds `global.load.async.to.lds` op to rocdl, supporting `b8`, `b32`, `b64` and `b128`. The op is lowered to the appropriate `llvm.amdgcn.global.load.async.to.lds.bXX` intrinsic.
This is available on gfx1250+.