Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,82 @@ def ROCDL_GlobalLoadLDSOp :
}];
}

//===---------------------------------------------------------------------===//
// Tensor load/store intrinsics (available in GFX1250)
//===---------------------------------------------------------------------===//

def ROCDL_TensorLoadToLDSIntrOp :
ROCDL_IntrOp<"tensor.load.to.lds", [], [], [], 0, 0, 1, 0, [4], ["cachePolicy"]> {
dag args = (ins Arg<LLVM_VectorOf<I32>, "", [MemRead]>:$dgroup0,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure if we want MemRead here, same as MemWrite below.
Here is what I can find:

  • the source and destination addresses are encoded in d#group 0 (as per the manual); groups 1, 2, and 3 hold other encodings unrelated to memory addresses.
  • underlying intrinsic has IntrInaccessibleMemOrArgMemOnly

I think memory properties are best associated with the ROCDL intrinsic instead of the individual arguments. @justinrosner

Copy link
Contributor Author

@justinrosner justinrosner Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to do something like the following instead then?

  • Add the MemoryEffectOpInterface to each class
  • Override the getEffects method with something like:
void $cppClass::getEffects(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  auto inaccessible = mlir::LLVM::InaccessibleMemoryResource::get();
  // These intrinsics read and write inaccessible memory (LDS).
  effects.emplace_back(mlir::MemoryEffects::Read::get(),  inaccessible, mlir::Value());
  effects.emplace_back(mlir::MemoryEffects::Write::get(), inaccessible, mlir::Value());
}

Does this give a better matching with what the underlying LLVM intrinsic is doing?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I quickly looked around, it seems the best we can do is to use RecursiveMemoryEffects to represent this op.

So it could be like:

ROCDL_IntrOp<"tensor.load.to.lds", [], [], [RecursiveMemoryEffects]...

Let me know what you think.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought that RecursiveMemoryEffects was meant for ops that contain regions and want their memory behavior to be the union of the effects of the nested ops? The new ops have a 1:1 mapping with the intrinsics, so they are not going to have any regions. So wouldn't marking them as recursive have no effect here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe there is a design question here: per the manual, TENSOR_LOAD_TO_LDS is decomposed internally into a series of GLOBAL_LOAD_ASYNC_TO_LDS operations because it uses the TDM. So it is intrinsically doing something more.

But looking around, I don't see other ROCDL ops that work on memory and actually have related memory traits. So I guess it is okay to skip it.

Arg<LLVM_VectorOf<I32>, "", [MemRead]>:$dgroup1,
Arg<LLVM_VectorOf<I32>, "", [MemRead]>:$dgroup2,
Arg<LLVM_VectorOf<I32>, "", [MemRead]>:$dgroup3,
I32Attr:$cachePolicy);
let arguments = !con(args, baseArgs);
let assemblyFormat = [{
$dgroup0 `,` $dgroup1 `,` $dgroup2 `,` $dgroup3 `,` $cachePolicy
attr-dict `:` type($dgroup0) `,` type($dgroup1) `,` type($dgroup2) `,` type($dgroup3)
}];
let extraClassDefinition = [{
SmallVector<Value> $cppClass::getAccessedOperands() {
return {getDgroup0(), getDgroup1(), getDgroup2(), getDgroup3()};
}
}];
}

// Four-descriptor-group tensor store from LDS. The translation test in
// mlir/test/Target/LLVMIR/rocdl.mlir shows it lowering to a call to
// `llvm.amdgcn.tensor.store.from.lds` with the cache policy as a trailing i32.
//
// No per-operand memory-effect decorators are attached: the descriptor groups
// are i32 vectors, not pointers, so a `MemWrite` pinned to them does not
// describe the memory the intrinsic touches and would declare a write-only
// effect set. Leaving the effects undeclared keeps the op conservatively
// side-effecting (unknown effects).
// NOTE(review): the trailing `[4], ["cachePolicy"]` template arguments
// presumably mark intrinsic argument 4 as an immediate taken from the
// `cachePolicy` attribute - confirm against ROCDL_IntrOp.
def ROCDL_TensorStoreFromLDSIntrOp :
    ROCDL_IntrOp<"tensor.store.from.lds", [], [], [], 0, 0, 1, 0, [4], ["cachePolicy"]> {
  dag args = (ins LLVM_VectorOf<I32>:$dgroup0,
                  LLVM_VectorOf<I32>:$dgroup1,
                  LLVM_VectorOf<I32>:$dgroup2,
                  LLVM_VectorOf<I32>:$dgroup3,
                  I32Attr:$cachePolicy);
  let arguments = !con(args, baseArgs);
  let assemblyFormat = [{
    $dgroup0 `,` $dgroup1 `,` $dgroup2 `,` $dgroup3 `,` $cachePolicy
    attr-dict `:` type($dgroup0) `,` type($dgroup1) `,` type($dgroup2) `,` type($dgroup3)
  }];
  // Reports all four descriptor groups as the accessed operands.
  let extraClassDefinition = [{
    SmallVector<Value> $cppClass::getAccessedOperands() {
      return {getDgroup0(), getDgroup1(), getDgroup2(), getDgroup3()};
    }
  }];
}

// Two-descriptor-group (`.d2`) tensor load to LDS. The translation test in
// mlir/test/Target/LLVMIR/rocdl.mlir shows it lowering to a call to
// `llvm.amdgcn.tensor.load.to.lds.d2` with the cache policy as a trailing i32.
//
// No per-operand memory-effect decorators are attached. The descriptor groups
// are i32 vectors rather than pointers, and this op produces no results: if
// its only declared effects were reads, generic cleanup (e.g. trivially-dead
// op elimination during canonicalization) would be entitled to erase it.
// Leaving the effects undeclared keeps the op conservatively side-effecting.
// NOTE(review): the trailing `[2], ["cachePolicy"]` template arguments
// presumably mark intrinsic argument 2 as an immediate taken from the
// `cachePolicy` attribute - confirm against ROCDL_IntrOp.
def ROCDL_TensorLoadToLDSIntrD2Op :
    ROCDL_IntrOp<"tensor.load.to.lds.d2", [], [], [], 0, 0, 1, 0, [2], ["cachePolicy"]> {
  dag args = (ins LLVM_VectorOf<I32>:$dgroup0,
                  LLVM_VectorOf<I32>:$dgroup1,
                  I32Attr:$cachePolicy);
  let arguments = !con(args, baseArgs);
  let assemblyFormat = [{
    $dgroup0 `,` $dgroup1 `,` $cachePolicy
    attr-dict `:` type($dgroup0) `,` type($dgroup1)
  }];
  // Reports both descriptor groups as the accessed operands.
  let extraClassDefinition = [{
    SmallVector<Value> $cppClass::getAccessedOperands() {
      return {getDgroup0(), getDgroup1()};
    }
  }];
}

// Two-descriptor-group (`.d2`) tensor store from LDS. The translation test in
// mlir/test/Target/LLVMIR/rocdl.mlir shows it lowering to a call to
// `llvm.amdgcn.tensor.store.from.lds.d2` with the cache policy as a trailing
// i32.
//
// No per-operand memory-effect decorators are attached: the descriptor groups
// are i32 vectors, not pointers, so a `MemWrite` pinned to them does not
// describe the memory the intrinsic touches and would declare a write-only
// effect set. Leaving the effects undeclared keeps the op conservatively
// side-effecting (unknown effects).
// NOTE(review): the trailing `[2], ["cachePolicy"]` template arguments
// presumably mark intrinsic argument 2 as an immediate taken from the
// `cachePolicy` attribute - confirm against ROCDL_IntrOp.
def ROCDL_TensorStoreFromLDSIntrD2Op :
    ROCDL_IntrOp<"tensor.store.from.lds.d2", [], [], [], 0, 0, 1, 0, [2], ["cachePolicy"]> {
  dag args = (ins LLVM_VectorOf<I32>:$dgroup0,
                  LLVM_VectorOf<I32>:$dgroup1,
                  I32Attr:$cachePolicy);
  let arguments = !con(args, baseArgs);
  let assemblyFormat = [{
    $dgroup0 `,` $dgroup1 `,` $cachePolicy
    attr-dict `:` type($dgroup0) `,` type($dgroup1)
  }];
  // Reports both descriptor groups as the accessed operands.
  let extraClassDefinition = [{
    SmallVector<Value> $cppClass::getAccessedOperands() {
      return {getDgroup0(), getDgroup1()};
    }
  }];
}

//===---------------------------------------------------------------------===//
// Operations on raw buffer resources (stride of 0, bounds checks either off or in
// raw buffer mode).
Expand Down
30 changes: 30 additions & 0 deletions mlir/test/Dialect/LLVMIR/rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,36 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
llvm.return
}

// Round-trip (parse/print) test for the four-descriptor tensor load to LDS.
llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>,
                                    %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) {
  // The label directive needs its trailing colon to be recognized; the opening
  // parenthesis keeps it from also matching the `.d2` variant's symbol.
  // CHECK-LABEL: @rocdl.tensor.load.to.lds(
  // CHECK: rocdl.tensor.load.to.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, 0 : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32>
  rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3, 0 : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32>
  llvm.return
}

// Round-trip (parse/print) test for the four-descriptor tensor store from LDS.
llvm.func @rocdl.tensor.store.from.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>,
                                       %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) {
  // The label directive needs its trailing colon to be recognized; the opening
  // parenthesis keeps it from also matching the `.d2` variant's symbol.
  // CHECK-LABEL: @rocdl.tensor.store.from.lds(
  // CHECK: rocdl.tensor.store.from.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, 0 : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32>
  rocdl.tensor.store.from.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3, 0 : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32>
  llvm.return
}

// Round-trip (parse/print) test for the two-descriptor tensor load to LDS.
llvm.func @rocdl.tensor.load.to.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) {
  // The label directive needs its trailing colon to be recognized by FileCheck.
  // CHECK-LABEL: @rocdl.tensor.load.to.lds.d2(
  // CHECK: rocdl.tensor.load.to.lds.d2 %{{.*}}, %{{.*}}, 0 : vector<4xi32>, vector<8xi32>
  rocdl.tensor.load.to.lds.d2 %dgroup0, %dgroup1, 0 : vector<4xi32>, vector<8xi32>
  llvm.return
}

// Round-trip (parse/print) test for the two-descriptor tensor store from LDS.
llvm.func @rocdl.tensor.store.from.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) {
  // The label directive needs its trailing colon to be recognized by FileCheck.
  // CHECK-LABEL: @rocdl.tensor.store.from.lds.d2(
  // CHECK: rocdl.tensor.store.from.lds.d2 %{{.*}}, %{{.*}}, 0 : vector<4xi32>, vector<8xi32>
  rocdl.tensor.store.from.lds.d2 %dgroup0, %dgroup1, 0 : vector<4xi32>, vector<8xi32>
  llvm.return
}

llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
%stride : i16,
%numRecords : i64,
Expand Down
30 changes: 30 additions & 0 deletions mlir/test/Target/LLVMIR/rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,36 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
llvm.return
}

// Translation test: the four-descriptor tensor load becomes a call to the
// matching AMDGPU intrinsic, with the cache policy emitted as a trailing i32.
llvm.func @rocdl.tensor.load.to.lds(
    %desc0 : vector<4xi32>,
    %desc1 : vector<8xi32>,
    %desc2 : vector<4xi32>,
    %desc3 : vector<4xi32>) {
  // CHECK-LABEL: rocdl.tensor.load.to.lds
  // CHECK: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 0)
  rocdl.tensor.load.to.lds %desc0, %desc1, %desc2, %desc3, 0
      : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32>
  llvm.return
}

// Translation test: the four-descriptor tensor store becomes a call to the
// matching AMDGPU intrinsic, with the cache policy emitted as a trailing i32.
llvm.func @rocdl.tensor.store.from.lds(
    %desc0 : vector<4xi32>,
    %desc1 : vector<8xi32>,
    %desc2 : vector<4xi32>,
    %desc3 : vector<4xi32>) {
  // CHECK-LABEL: rocdl.tensor.store.from.lds
  // CHECK: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 0)
  rocdl.tensor.store.from.lds %desc0, %desc1, %desc2, %desc3, 0
      : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32>
  llvm.return
}

// Translation test: the two-descriptor tensor load becomes a call to the
// matching `.d2` AMDGPU intrinsic, with the cache policy as a trailing i32.
llvm.func @rocdl.tensor.load.to.lds.d2(
    %desc0 : vector<4xi32>,
    %desc1 : vector<8xi32>) {
  // CHECK-LABEL: rocdl.tensor.load.to.lds.d2
  // CHECK: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i32 0)
  rocdl.tensor.load.to.lds.d2 %desc0, %desc1, 0
      : vector<4xi32>, vector<8xi32>
  llvm.return
}

// Translation test: the two-descriptor tensor store becomes a call to the
// matching `.d2` AMDGPU intrinsic, with the cache policy as a trailing i32.
llvm.func @rocdl.tensor.store.from.lds.d2(
    %desc0 : vector<4xi32>,
    %desc1 : vector<8xi32>) {
  // CHECK-LABEL: rocdl.tensor.store.from.lds.d2
  // CHECK: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i32 0)
  rocdl.tensor.store.from.lds.d2 %desc0, %desc1, 0
      : vector<4xi32>, vector<8xi32>
  llvm.return
}

llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
%stride : i16,
%numRecords : i64,
Expand Down