Skip to content

Commit bcb72e3

Browse files
committed
[AMDGPU] Add a new amdgcn.load.to.lds intrinsic
This PR adds an amdgcn_load_to_lds intrinsic that abstracts over loads to LDS from global (address space 1) pointers and buffer fat pointers (address space 7), since they use the same API and "gather from a pointer to LDS" is something of an abstract operation. This commit adds the intrinsic and its lowerings for addrspaces 1 and 7, and updates the MLIR wrappers to use it (loosening up the restrictions on loads to LDS along the way to match the ground truth from target features). It also plumbs the intrinsic through to clang.
1 parent ff8fc5b commit bcb72e3

File tree

21 files changed

+598
-53
lines changed

21 files changed

+598
-53
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
257257
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
258258
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
259259
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
260+
TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
260261
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
261262

262263
//===----------------------------------------------------------------------===//

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
564564
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
565565
return Builder.CreateCall(F, {Addr});
566566
}
567+
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
568+
return emitBuiltinWithOneOverloadedType<5>(*this, E,
569+
Intrinsic::amdgcn_load_to_lds);
570+
}
567571
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
568572
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
569573
{llvm::Type::getInt64Ty(getLLVMContext())});

clang/lib/Sema/SemaAMDGPU.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
3636

3737
switch (BuiltinID) {
3838
case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
39+
case AMDGPU::BI__builtin_amdgcn_load_to_lds:
3940
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
4041
constexpr const int SizeIdx = 2;
4142
llvm::APSInt Size;

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1766,6 +1766,36 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
17661766
*out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1);
17671767
}
17681768

1769+
// CHECK-LABEL: @test_load_to_lds_96(
1770+
// CHECK-NEXT: entry:
1771+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1772+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
1773+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
1774+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
1775+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
1776+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
1777+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
1778+
// CHECK-NEXT: ret void
1779+
//
1780+
// Exercises the generic load-to-LDS builtin with a 12-byte transfer from a
// global pointer. The preceding CHECK lines expect a call to
// @llvm.amdgcn.load.to.lds.p1, so this must use __builtin_amdgcn_load_to_lds
// (the global_load_lds builtin has its own test, test_global_load_lds_96).
void test_load_to_lds_96(global void* src, local void *dst) {
1781+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
1782+
}
1783+
1784+
// CHECK-LABEL: @test_load_to_lds_128(
1785+
// CHECK-NEXT: entry:
1786+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1787+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
1788+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
1789+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
1790+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
1791+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
1792+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
1793+
// CHECK-NEXT: ret void
1794+
//
1795+
// Exercises __builtin_amdgcn_load_to_lds with a 16-byte transfer from a global
// pointer; the preceding CHECK lines expect @llvm.amdgcn.load.to.lds.p1 with
// a size operand of i32 16.
void test_load_to_lds_128(global void* src, local void *dst) {
1796+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
1797+
}
1798+
17691799
// CHECK-LABEL: @test_global_load_lds_96(
17701800
// CHECK-NEXT: entry:
17711801
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
3+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
4+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
5+
// REQUIRES: amdgpu-registered-target
6+
7+
typedef unsigned int u32;
8+
typedef unsigned short u16;
9+
typedef unsigned char u8;
10+
11+
// CHECK-LABEL: @test_load_to_lds_u32(
12+
// CHECK-NEXT: entry:
13+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
14+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
15+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
16+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
17+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
18+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
19+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
20+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
21+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
22+
// CHECK-NEXT: ret void
23+
//
24+
// Exercises __builtin_amdgcn_load_to_lds with a 4-byte transfer through typed
// (u32) global and local pointers; the preceding CHECK lines expect
// @llvm.amdgcn.load.to.lds.p1 with a size operand of i32 4.
void test_load_to_lds_u32(global u32* src, local u32 *dst) {
25+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
26+
}
27+
28+
// CHECK-LABEL: @test_load_to_lds_u16(
29+
// CHECK-NEXT: entry:
30+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
31+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
32+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
33+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
34+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
35+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
36+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
37+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
38+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
39+
// CHECK-NEXT: ret void
40+
//
41+
// Exercises __builtin_amdgcn_load_to_lds with a 2-byte transfer through typed
// (u16) global and local pointers; the preceding CHECK lines expect
// @llvm.amdgcn.load.to.lds.p1 with a size operand of i32 2.
void test_load_to_lds_u16(global u16* src, local u16 *dst) {
42+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
43+
}
44+
45+
// CHECK-LABEL: @test_load_to_lds_u8(
46+
// CHECK-NEXT: entry:
47+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
48+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
49+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
50+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
51+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
52+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
53+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
54+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
55+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
56+
// CHECK-NEXT: ret void
57+
//
58+
// Exercises __builtin_amdgcn_load_to_lds with a 1-byte transfer through typed
// (u8) global and local pointers; the preceding CHECK lines expect
// @llvm.amdgcn.load.to.lds.p1 with a size operand of i32 1.
void test_load_to_lds_u8(global u8* src, local u8 *dst) {
59+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
60+
}

llvm/docs/ReleaseNotes.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,14 @@ Changes to the AMDGPU Backend
103103

104104
* Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6.
105105

106+
* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing `amdgcn.global.load.lds`
107+
intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers
108+
(`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be
109+
represented in the IR without needing to use buffer resource intrinsics directly.
110+
This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though
111+
buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is
112+
optional, and there are no plans to deprecate `amdgcn.global.load.lds`.
113+
106114
Changes to the ARM Backend
107115
--------------------------
108116

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2641,6 +2641,27 @@ def int_amdgcn_perm :
26412641
// GFX9 Intrinsics
26422642
//===----------------------------------------------------------------------===//
26432643

2644+
/// This is a general-purpose intrinsic for all operations that take a pointer,
2645+
/// a base location in LDS, and a data size, and use them to perform a gather to LDS.
2646+
/// This allows abstracting over both global pointers (address space 1) and
2647+
/// the buffer-resource-wrapper pointers (address space 7 and 9).
2648+
/// TODO: add support for address space 5 and scratch_load_lds.
2649+
class AMDGPULoadToLDS :
2650+
ClangBuiltin<"__builtin_amdgcn_load_to_lds">,
2651+
Intrinsic <
2652+
[],
2653+
[llvm_anyptr_ty, // Base pointer to load from. Varies per lane.
2654+
LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform.
2655+
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
2656+
llvm_i32_ty, // imm offset (applied to both input and LDS address)
2657+
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
2658+
// bit 1 = sc1,
2659+
// bit 4 = scc))
2660+
[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
2661+
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
2662+
"", [SDNPMemOperand]>;
2663+
def int_amdgcn_load_to_lds : AMDGPULoadToLDS;
2664+
26442665
class AMDGPUGlobalLoadLDS :
26452666
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
26462667
Intrinsic <

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2335,6 +2335,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
23352335
case Intrinsic::amdgcn_struct_buffer_load_lds:
23362336
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
23372337
return selectBufferLoadLds(I);
2338+
// Until we can store both the address space of the global and the LDS
2339+
// arguments by having two MachineMemOperands on an intrinsic, we just trust
2340+
// that the argument is a global pointer (buffer pointers have been handled by
2341+
// an LLVM IR-level lowering).
2342+
case Intrinsic::amdgcn_load_to_lds:
23382343
case Intrinsic::amdgcn_global_load_lds:
23392344
return selectGlobalLoadLds(I);
23402345
case Intrinsic::amdgcn_exp_compr:

llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2151,6 +2151,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
21512151
case Intrinsic::memset:
21522152
case Intrinsic::memset_inline:
21532153
case Intrinsic::experimental_memset_pattern:
2154+
case Intrinsic::amdgcn_load_to_lds:
21542155
return true;
21552156
}
21562157
}
@@ -2239,6 +2240,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
22392240
SplitUsers.insert(&I);
22402241
return {NewRsrc, Off};
22412242
}
2243+
case Intrinsic::amdgcn_load_to_lds: {
2244+
Value *Ptr = I.getArgOperand(0);
2245+
if (!isSplitFatPtr(Ptr->getType()))
2246+
return {nullptr, nullptr};
2247+
IRB.SetInsertPoint(&I);
2248+
auto [Rsrc, Off] = getPtrParts(Ptr);
2249+
Value *LDSPtr = I.getArgOperand(1);
2250+
Value *LoadSize = I.getArgOperand(2);
2251+
Value *ImmOff = I.getArgOperand(3);
2252+
Value *Aux = I.getArgOperand(4);
2253+
Value *SOffset = IRB.getInt32(0);
2254+
Instruction *NewLoad = IRB.CreateIntrinsic(
2255+
Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
2256+
{Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
2257+
copyMetadata(NewLoad, &I);
2258+
SplitUsers.insert(&I);
2259+
I.replaceAllUsesWith(NewLoad);
2260+
return {nullptr, nullptr};
2261+
}
22422262
}
22432263
return {nullptr, nullptr};
22442264
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3312,6 +3312,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
33123312
constrainOpWithReadfirstlane(B, MI, 6); // soffset
33133313
return;
33143314
}
3315+
case Intrinsic::amdgcn_load_to_lds:
33153316
case Intrinsic::amdgcn_global_load_lds: {
33163317
applyDefaultMapping(OpdMapper);
33173318
constrainOpWithReadfirstlane(B, MI, 2);
@@ -5273,6 +5274,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
52735274
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
52745275
break;
52755276
}
5277+
case Intrinsic::amdgcn_load_to_lds:
52765278
case Intrinsic::amdgcn_global_load_lds: {
52775279
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
52785280
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

0 commit comments

Comments
 (0)