Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,39 @@ def ROCDL_GlobalLoadLDSOp :
}];
}

//===---------------------------------------------------------------------===//
// Async load to LDS intrinsic (available in GFX1250)
//===---------------------------------------------------------------------===//

// Shared definition for the `global.load.async.to.lds.*` ops. Each concrete op
// passes its own `mnemonic`, which is forwarded unchanged to ROCDL_IntrOp.
// NOTE(review): ROCDL_IntrOp's parameter list is not visible here. The trailing
// `[2, 3]` / ["offset", "aux"] pair presumably marks operands 2 and 3 as
// immediate arguments backed by the `offset` and `aux` attributes - confirm
// against the ROCDL_IntrOp definition.
class ROCDL_GlobalLoadAsyncToLDSOp<string mnemonic> :
ROCDL_IntrOp<mnemonic, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> {
// Op-specific arguments: a global pointer annotated MemRead, an LDS pointer
// annotated MemWrite, and two I32 attributes.
dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
I32Attr:$offset,
I32Attr:$aux);
// Append `baseArgs` (defined outside this block) to the arguments above.
let arguments = !con(args, baseArgs);
// Textual form lists both pointers and both attributes; only the type of
// $globalPtr is spelled out after the colon.
let assemblyFormat = [{
$globalPtr `,` $ldsPtr `,` $offset `,` $aux
attr-dict `:` type($globalPtr)
}];
let description = [{
Loads data asynchronously from a global memory pointer to a local data
store (LDS) pointer.

Available on gfx1250+.
}];
// Defines getAccessedOperands() for the generated op class; it reports both
// pointer operands (the global source and the LDS destination).
let extraClassDefinition = [{
::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
return {getGlobalPtr(), getLdsPtr()};
}
}];
}

// Instantiates the async global-to-LDS load op for the `b8` mnemonic
// (presumably the 8-bit variant - confirm against the intrinsic definition).
def ROCDL_GlobalLoadAsyncToLDSB8Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b8">;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seconding Ravil's nits here.

// Remaining instantiations of the async global-to-LDS load op, one per
// mnemonic suffix (`b32`, `b64`, `b128`; presumably the access width in bits -
// confirm against the intrinsic definitions).
def ROCDL_GlobalLoadAsyncToLDSB32Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b32">;
def ROCDL_GlobalLoadAsyncToLDSB64Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b64">;
def ROCDL_GlobalLoadAsyncToLDSB128Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b128">;
Copy link
Contributor

@ravil-mobile ravil-mobile Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: Maybe we can use the `foreach` construct in TableGen?

// Generates one async global-to-LDS load op per access width. The widths are
// in bits, matching the `b8`/`b32`/`b64`/`b128` intrinsic suffixes. The
// iterator is not called `bits` because that is a TableGen keyword.
foreach bitWidth = [8, 32, 64, 128] in {
  // `defvar` (not `let ... in`) is what binds a local name usable in the
  // record name and template arguments below.
  defvar bitsStr = "b" # !cast<string>(bitWidth);
  def ROCDL_GlobalLoadAsyncToLDS # !toupper(bitsStr) # Op :
      ROCDL_IntrOp<"global.load.async.to.lds." # bitsStr, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> {
    // Op-specific arguments: global source pointer (read), LDS destination
    // pointer (written), and two I32 attributes.
    dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
                    Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
                    I32Attr:$offset,
                    I32Attr:$aux);
    let arguments = !con(args, baseArgs);
    let assemblyFormat = [{
      $globalPtr `,`  $ldsPtr `,` $offset `,` $aux
      attr-dict `:` type($globalPtr)
    }];
    // A `[{ ... }]` literal is not interpolated, so the width has to be pasted
    // in between two literals to appear in the generated documentation.
    let description = [{
      Asynchronously loads }] # !cast<string>(bitWidth) # [{ bits of data from a global memory to a Local Data
      Store (LDS).

      Available on gfx1250+.
    }];
    // Reports both pointer operands as accessed by this op.
    let extraClassDefinition = [{
      ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
        return {getGlobalPtr(), getLdsPtr()};
      }
    }];
  }
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow, that's some cool tablegen black magic I didn't know about!

I have used `foreach` as you suggested, with two small changes: the iterator should be in bits, not bytes, and the description was not showing the right string, so I fixed that as well.

I have checked the generated documentation and it correctly shows the 4 ops with the right description. What I don't like is that they are not generated in order: b128 comes first, then b32, then b64, then b8.


//===---------------------------------------------------------------------===//
// Tensor load/store intrinsics (available in GFX1250)
//===---------------------------------------------------------------------===//
Expand Down
13 changes: 13 additions & 0 deletions mlir/test/Dialect/LLVMIR/rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
llvm.return
}

// Exercises all four widths of the async global-to-LDS load op with zero
// `offset` and `aux` immediates.
// CHECK-LABEL: @rocdl.global.load.async.to.lds
llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
  // CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0
  // CHECK: rocdl.global.load.async.to.lds.b32 %{{.*}}, %{{.*}}, 0, 0
  // CHECK: rocdl.global.load.async.to.lds.b64 %{{.*}}, %{{.*}}, 0, 0
  // CHECK: rocdl.global.load.async.to.lds.b128 %{{.*}}, %{{.*}}, 0, 0
  rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : <1>
  rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : <1>
  rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : <1>
  rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : <1>
  llvm.return
}

// CHECK-LABEL: @rocdl.tensor.load.to.lds
llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>,
%dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) {
Expand Down
24 changes: 24 additions & 0 deletions mlir/test/Target/LLVMIR/rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,30 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
llvm.return
}

// Translation of the 8-bit async global-to-LDS load: the op must lower to the
// matching AMDGCN intrinsic call with both pointers and both zero immediates.
// CHECK-LABEL: rocdl.global.load.async.lds.b8
llvm.func @rocdl.global.load.async.lds.b8(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
  // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) %{{.*}}, ptr addrspace(3) %{{.*}}, i32 0, i32 0)
  rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : !llvm.ptr<1>
  llvm.return
}

// Translation of the 32-bit async global-to-LDS load: the op must lower to the
// matching AMDGCN intrinsic call with both pointers and both zero immediates.
// CHECK-LABEL: rocdl.global.load.async.lds.b32
llvm.func @rocdl.global.load.async.lds.b32(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
  // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %{{.*}}, ptr addrspace(3) %{{.*}}, i32 0, i32 0)
  rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : !llvm.ptr<1>
  llvm.return
}

// Translation of the 64-bit async global-to-LDS load: the op must lower to the
// matching AMDGCN intrinsic call with both pointers and both zero immediates.
// CHECK-LABEL: rocdl.global.load.async.lds.b64
llvm.func @rocdl.global.load.async.lds.b64(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
  // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %{{.*}}, ptr addrspace(3) %{{.*}}, i32 0, i32 0)
  rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : !llvm.ptr<1>
  llvm.return
}

// Translation of the 128-bit async global-to-LDS load: the op must lower to
// the matching AMDGCN intrinsic call with both pointers and both zero
// immediates.
// CHECK-LABEL: rocdl.global.load.async.lds.b128
llvm.func @rocdl.global.load.async.lds.b128(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
  // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %{{.*}}, ptr addrspace(3) %{{.*}}, i32 0, i32 0)
  rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : !llvm.ptr<1>
  llvm.return
}

// CHECK-LABEL: rocdl.tensor.load.to.lds
llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>,
%dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) {
Expand Down