Skip to content

Commit 976aa3b

Browse files
committed
[AMDGPU] Generalize global.load.lds to buffer fat pointers
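Direct loads to LDS can also be implemented for buffer fat pointers: the fat pointer is split into its buffer resource and 32-bit offset, the resource is passed as the buffer descriptor, and the offset is passed as the offset operand of raw.ptr.buffer.load.lds. This commit generalizes the existing global.load.lds intrinsic to support this usage.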
1 parent 441f879 commit 976aa3b

File tree

3 files changed

+47
-3
lines changed

3 files changed

+47
-3
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2624,17 +2624,20 @@ def int_amdgcn_perm :
26242624
// GFX9 Intrinsics
26252625
//===----------------------------------------------------------------------===//
26262626

2627+
// Intrinsic for loading data from a global-memory pointer to LDS
2628+
// Also supports buffer fat pointers.
26272629
class AMDGPUGlobalLoadLDS :
26282630
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
26292631
Intrinsic <
26302632
[],
2631-
[LLVMQualPointerType<1>, // Base global pointer to load from
2632-
LLVMQualPointerType<3>, // LDS base pointer to store to
2633+
[llvm_anyptr_ty, // Global or buffer fat pointer to load from (per-lane)
2634+
LLVMQualPointerType<3>, // LDS base pointer to store to (uniform)
26332635
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
26342636
llvm_i32_ty, // imm offset (applied to both global and LDS address)
26352637
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
26362638
// bit 1 = sc1,
26372639
// bit 4 = scc))
2640+
// See raw_ptr_buffer_load_lds for semantics on ptr addrspace(7)
26382641
[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
26392642
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
26402643
"", [SDNPMemOperand]>;

llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
21672167
case Intrinsic::memset:
21682168
case Intrinsic::memset_inline:
21692169
case Intrinsic::experimental_memset_pattern:
2170+
case Intrinsic::amdgcn_global_load_lds:
21702171
return true;
21712172
}
21722173
}
@@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
22552256
SplitUsers.insert(&I);
22562257
return {NewRsrc, Off};
22572258
}
2259+
case Intrinsic::amdgcn_global_load_lds: {
2260+
Value *Ptr = I.getArgOperand(0);
2261+
if (!isSplitFatPtr(Ptr->getType()))
2262+
return {nullptr, nullptr};
2263+
IRB.SetInsertPoint(&I);
2264+
auto [Rsrc, Off] = getPtrParts(Ptr);
2265+
Value *LDSPtr = I.getArgOperand(1);
2266+
Value *LoadSize = I.getArgOperand(2);
2267+
Value *ImmOff = I.getArgOperand(3);
2268+
Value *Aux = I.getArgOperand(4);
2269+
Value *SOffset = IRB.getInt32(0);
2270+
Instruction *NewLoad = IRB.CreateIntrinsic(
2271+
Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
2272+
{Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
2273+
copyMetadata(NewLoad, &I);
2274+
SplitUsers.insert(&I);
2275+
I.replaceAllUsesWith(NewLoad);
2276+
return {nullptr, nullptr};
2277+
}
22582278
}
22592279
return {nullptr, nullptr};
22602280
}
@@ -2291,7 +2311,10 @@ class AMDGPULowerBufferFatPointers : public ModulePass {
22912311
public:
22922312
static char ID;
22932313

2294-
AMDGPULowerBufferFatPointers() : ModulePass(ID) {}
2314+
AMDGPULowerBufferFatPointers() : ModulePass(ID) {
2315+
initializeAMDGPULowerBufferFatPointersPass(
2316+
*PassRegistry::getPassRegistry());
2317+
}
22952318

22962319
bool run(Module &M, const TargetMachine &TM);
22972320
bool runOnModule(Module &M) override;

llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1724,3 +1724,21 @@ define void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %leng
17241724
call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 %length, i1 false)
17251725
ret void
17261726
}
1727+
1728+
;;; Buffer load to LDS
1729+
1730+
declare void @llvm.amdgcn.global.load.lds.p7(ptr addrspace(7), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg)
1731+
1732+
define void @llvm_amdgcn_global_load_lds(ptr addrspace(7) inreg %p, ptr addrspace(3) inreg %l, i32 %idx) {;
1733+
; CHECK-LABEL: define void @llvm_amdgcn_global_load_lds(
1734+
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[P:%.*]], ptr addrspace(3) inreg [[L:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
1735+
; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0
1736+
; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1
1737+
; CHECK-NEXT: [[Q:%.*]] = add i32 [[P_OFF]], [[IDX]]
1738+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[P_RSRC]], ptr addrspace(3) [[L]], i32 4, i32 [[Q]], i32 0, i32 16, i32 0)
1739+
; CHECK-NEXT: ret void
1740+
;
1741+
%q = getelementptr i8, ptr addrspace(7) %p, i32 %idx
1742+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(7) %q, ptr addrspace(3) %l, i32 4, i32 16, i32 0)
1743+
ret void
1744+
}

0 commit comments

Comments
 (0)