Skip to content

Commit 7753f61

Browse files
authored
[AMDGPU] Support cluster_load_async_to_lds instructions on gfx1250 (#156595)
1 parent 8c716be commit 7753f61

16 files changed

+689
-2
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -670,6 +670,10 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx
670670
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b32, "ii*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
671671
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b64, "V2iV2i*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
672672
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b128, "V4iV4i*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
673+
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_async_to_lds_b8, "vc*1c*3IiIii", "nc", "gfx1250-insts,wavefrontsize32")
674+
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_async_to_lds_b32, "vi*1i*3IiIii", "nc", "gfx1250-insts,wavefrontsize32")
675+
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_async_to_lds_b64, "vV2i*1V2i*3IiIii", "nc", "gfx1250-insts,wavefrontsize32")
676+
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_async_to_lds_b128, "vV4i*1V4i*3IiIii", "nc", "gfx1250-insts,wavefrontsize32")
673677
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
674678
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
675679
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,46 @@
55
typedef int v2i __attribute__((ext_vector_type(2)));
66
typedef int v4i __attribute__((ext_vector_type(4)));
77

8+
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b8(
9+
// CHECK-GFX1250-NEXT: entry:
10+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]])
11+
// CHECK-GFX1250-NEXT: ret void
12+
//
13+
// Calls the b8 cluster async load-to-LDS builtin with a constant offset (16)
// and constant cache policy (0); the mask is forwarded as a runtime value.
void test_amdgcn_cluster_load_async_to_lds_b8(global char* gaddr, local char* laddr, int mask)
14+
{
15+
__builtin_amdgcn_cluster_load_async_to_lds_b8(gaddr, laddr, 16, 0, mask);
16+
}
17+
18+
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b32(
19+
// CHECK-GFX1250-NEXT: entry:
20+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]])
21+
// CHECK-GFX1250-NEXT: ret void
22+
//
23+
// Calls the b32 cluster async load-to-LDS builtin with a constant offset (16)
// and constant cache policy (0); the mask is forwarded as a runtime value.
void test_amdgcn_cluster_load_async_to_lds_b32(global int* gaddr, local int* laddr, int mask)
24+
{
25+
__builtin_amdgcn_cluster_load_async_to_lds_b32(gaddr, laddr, 16, 0, mask);
26+
}
27+
28+
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b64(
29+
// CHECK-GFX1250-NEXT: entry:
30+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]])
31+
// CHECK-GFX1250-NEXT: ret void
32+
//
33+
// Calls the b64 cluster async load-to-LDS builtin (v2i pointers) with a
// constant offset (16) and constant cache policy (0); the mask is forwarded as
// a runtime value.
void test_amdgcn_cluster_load_async_to_lds_b64(global v2i* gaddr, local v2i* laddr, int mask)
34+
{
35+
__builtin_amdgcn_cluster_load_async_to_lds_b64(gaddr, laddr, 16, 0, mask);
36+
}
37+
38+
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b128(
39+
// CHECK-GFX1250-NEXT: entry:
40+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]])
41+
// CHECK-GFX1250-NEXT: ret void
42+
//
43+
// Calls the b128 cluster async load-to-LDS builtin (v4i pointers) with a
// constant offset (16) and constant cache policy (0); the mask is forwarded as
// a runtime value.
void test_amdgcn_cluster_load_async_to_lds_b128(global v4i* gaddr, local v4i* laddr, int mask)
44+
{
45+
__builtin_amdgcn_cluster_load_async_to_lds_b128(gaddr, laddr, 16, 0, mask);
46+
}
47+
848
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b8(
949
// CHECK-GFX1250-NEXT: entry:
1050
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,11 @@ void test_amdgcn_cluster_load(global int* addr32, global v2i* addr64, global v4i
113113
void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8,
114114
local int *laddr32, local v2i* laddr64, local v4i* laddr128, int offset, int mask)
115115
{
116+
__builtin_amdgcn_cluster_load_async_to_lds_b8(gaddr8, laddr8, offset, 0, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b8' must be a constant integer}}
117+
__builtin_amdgcn_cluster_load_async_to_lds_b32(gaddr32, laddr32, offset, 0, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b32' must be a constant integer}}
118+
__builtin_amdgcn_cluster_load_async_to_lds_b64(gaddr64, laddr64, offset, 0, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b64' must be a constant integer}}
119+
__builtin_amdgcn_cluster_load_async_to_lds_b128(gaddr128, laddr128, offset, 0, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b128' must be a constant integer}}
120+
116121
__builtin_amdgcn_global_load_async_to_lds_b8(gaddr8, laddr8, offset, 0); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b8' must be a constant integer}}
117122
__builtin_amdgcn_global_load_async_to_lds_b32(gaddr32, laddr32, offset, 0); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b32' must be a constant integer}}
118123
__builtin_amdgcn_global_load_async_to_lds_b64(gaddr64, laddr64, offset, 0); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b64' must be a constant integer}}
@@ -127,6 +132,11 @@ void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *ga
127132
void test_amdgcn_async_load_store_lds_cpol(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8,
128133
local int *laddr32, local v2i* laddr64, local v4i* laddr128, int cpol, int mask)
129134
{
135+
__builtin_amdgcn_cluster_load_async_to_lds_b8(gaddr8, laddr8, 16, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b8' must be a constant integer}}
136+
__builtin_amdgcn_cluster_load_async_to_lds_b32(gaddr32, laddr32, 16, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b32' must be a constant integer}}
137+
__builtin_amdgcn_cluster_load_async_to_lds_b64(gaddr64, laddr64, 16, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b64' must be a constant integer}}
138+
__builtin_amdgcn_cluster_load_async_to_lds_b128(gaddr128, laddr128, 16, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_async_to_lds_b128' must be a constant integer}}
139+
130140
__builtin_amdgcn_global_load_async_to_lds_b8(gaddr8, laddr8, 16, cpol); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b8' must be a constant integer}}
131141
__builtin_amdgcn_global_load_async_to_lds_b32(gaddr32, laddr32, 16, cpol); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b32' must be a constant integer}}
132142
__builtin_amdgcn_global_load_async_to_lds_b64(gaddr64, laddr64, 16, cpol); // expected-error {{'__builtin_amdgcn_global_load_async_to_lds_b64' must be a constant integer}}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3815,6 +3815,21 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
38153815
[IntrNoMem, IntrSpeculatable]
38163816
>;
38173817

3818+
// Async instructions increment ASYNCcnt which is modeled as InaccessibleMem.
3819+
// Shared signature for the cluster_load_async_to_lds_* intrinsics.
// The intrinsic produces no result. It takes five operands: a global source
// pointer, an LDS destination pointer, an offset, a cache-policy word, and a
// workgroup broadcast mask.
class AMDGPUAsyncClusterLoadLDS : Intrinsic <
3820+
[],
3821+
[global_ptr_ty, // Base global pointer to load from
3822+
local_ptr_ty, // LDS base pointer to store to
3823+
llvm_i32_ty, // offset
3824+
llvm_i32_ty, // gfx12+ cachepolicy:
3825+
// bits [0-2] = th
3826+
// bits [3-4] = scope
3827+
llvm_i32_ty], // workgroup broadcast mask (to M0)
3828+
// Attributes: arg 0 is only read and arg 1 is only written, and neither
// pointer is captured. The offset (arg 2) and cache policy (arg 3) must be
// immediates; the mask (arg 4) has no ImmArg and may be a runtime value.
[IntrInaccessibleMemOrArgMemOnly, ReadOnly<ArgIndex<0>>, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
3829+
NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree],
3830+
"", [SDNPMemOperand]
3831+
>;
3832+
38183833
class AMDGPUAsyncGlobalLoadToLDS : Intrinsic <
38193834
[],
38203835
[global_ptr_ty, // Base global pointer to load from
@@ -3841,6 +3856,15 @@ class AMDGPUAsyncGlobalStoreFromLDS : Intrinsic <
38413856
"", [SDNPMemOperand]
38423857
>;
38433858

3859+
// Cluster async load-to-LDS intrinsics, one per suffix (b8, b32, b64, b128).
// Each is bound to the Clang builtin of the matching name and inherits its
// operand list and attributes from AMDGPUAsyncClusterLoadLDS.
// NOTE(review): the suffix presumably gives the access width in bits - confirm.
def int_amdgcn_cluster_load_async_to_lds_b8 :
3860+
ClangBuiltin<"__builtin_amdgcn_cluster_load_async_to_lds_b8">, AMDGPUAsyncClusterLoadLDS;
3861+
def int_amdgcn_cluster_load_async_to_lds_b32 :
3862+
ClangBuiltin<"__builtin_amdgcn_cluster_load_async_to_lds_b32">, AMDGPUAsyncClusterLoadLDS;
3863+
def int_amdgcn_cluster_load_async_to_lds_b64 :
3864+
ClangBuiltin<"__builtin_amdgcn_cluster_load_async_to_lds_b64">, AMDGPUAsyncClusterLoadLDS;
3865+
def int_amdgcn_cluster_load_async_to_lds_b128 :
3866+
ClangBuiltin<"__builtin_amdgcn_cluster_load_async_to_lds_b128">, AMDGPUAsyncClusterLoadLDS;
3867+
38443868
def int_amdgcn_global_load_async_to_lds_b8 :
38453869
ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b8">, AMDGPUAsyncGlobalLoadToLDS;
38463870
def int_amdgcn_global_load_async_to_lds_b32 :

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -137,6 +137,9 @@ def gi_global_saddr_glc :
137137
def gi_global_saddr_no_ioffset :
138138
GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">,
139139
GIComplexPatternEquiv<GlobalSAddrNoIOffset>;
140+
// GlobalISel counterpart of the GlobalSAddrNoIOffsetM0 complex pattern: the
// 64-bit (s64) operand is matched by the C++ selector
// selectGlobalSAddrNoIOffsetM0.
def gi_global_saddr_no_ioffset_m0 :
141+
GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffsetM0">,
142+
GIComplexPatternEquiv<GlobalSAddrNoIOffsetM0>;
140143

141144
def gi_mubuf_scratch_offset :
142145
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2137,6 +2137,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
21372137
return true;
21382138
}
21392139

2140+
// SelectionDAG complex-pattern selector for global addresses in SAddr +
// VOffset form, used by intrinsics that carry one more operand after the cache
// policy. SAddr and VOffset are filled in by SelectGlobalSAddr; the immediate
// offset it produces is discarded. CPol is rebuilt from the intrinsic's
// cache-policy operand. Returns false when SelectGlobalSAddr does not match.
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2141+
SDValue &SAddr,
2142+
SDValue &VOffset,
2143+
SDValue &CPol) const {
2144+
// Output of SelectGlobalSAddr; consumed below to decide the SCAL bit.
bool ScaleOffset;
2145+
// Receives the immediate offset from SelectGlobalSAddr and is never read.
SDValue DummyOffset;
2146+
// NOTE(review): the trailing `false` presumably tells SelectGlobalSAddr not
// to fold an immediate offset - confirm against its declaration.
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2147+
false))
2148+
return false;
2149+
2150+
// We are assuming CPol is second from last operand of the intrinsic.
// Any SCAL bit in the passed value is cleared here and re-added below only
// when SelectGlobalSAddr reported ScaleOffset.
2151+
auto PassedCPol =
2152+
N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2153+
CPol = CurDAG->getTargetConstant(
2154+
(ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2155+
return true;
2156+
}
2157+
21402158
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
21412159
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
21422160
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -179,6 +179,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
179179
SDValue &CPol) const;
180180
bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
181181
SDValue &VOffset, SDValue &CPol) const;
182+
bool SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr, SDValue &SAddr,
183+
SDValue &VOffset, SDValue &CPol) const;
182184
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
183185
SDValue &Offset) const;
184186
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5734,6 +5734,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
57345734
return selectGlobalSAddr(Root, PassedCPol, false);
57355735
}
57365736

5737+
// GlobalISel complex-operand selector for intrinsics that carry one more
// operand after the cache policy. It reads the cache-policy immediate from the
// parent instruction, clears its SCAL bit, and passes the result to
// selectGlobalSAddr, whose renderer functions are returned unchanged.
InstructionSelector::ComplexRendererFns
5738+
AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
5739+
MachineOperand &Root) const {
5740+
const MachineInstr &I = *Root.getParent();
5741+
5742+
// We are assuming CPol is second from last operand of the intrinsic.
// The SCAL bit is cleared from the passed cache-policy value here.
5743+
auto PassedCPol =
5744+
I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5745+
// NOTE(review): the trailing `false` presumably tells selectGlobalSAddr not
// to fold an immediate offset - confirm against its declaration.
return selectGlobalSAddr(Root, PassedCPol, false);
5746+
}
5747+
57375748
InstructionSelector::ComplexRendererFns
57385749
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
57395750
Register Addr = Root.getReg();

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -261,6 +261,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
261261
selectGlobalSAddrGLC(MachineOperand &Root) const;
262262
InstructionSelector::ComplexRendererFns
263263
selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
264+
InstructionSelector::ComplexRendererFns
265+
selectGlobalSAddrNoIOffsetM0(MachineOperand &Root) const;
264266

265267
InstructionSelector::ComplexRendererFns
266268
selectScratchSAddr(MachineOperand &Root) const;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3322,6 +3322,14 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
33223322
constrainOpWithReadfirstlane(B, MI, 6); // soffset
33233323
return;
33243324
}
3325+
case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
3326+
case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
3327+
case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
3328+
case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
3329+
applyDefaultMapping(OpdMapper);
3330+
constrainOpWithReadfirstlane(B, MI, 5);
3331+
return;
3332+
}
33253333
case Intrinsic::amdgcn_load_to_lds:
33263334
case Intrinsic::amdgcn_global_load_lds: {
33273335
applyDefaultMapping(OpdMapper);
@@ -5483,6 +5491,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
54835491
OpdsMapping[4] = AMDGPU::getValueMapping(M0Bank, 32);
54845492
break;
54855493
}
5494+
case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
5495+
case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
5496+
case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
5497+
case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
5498+
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5499+
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5500+
unsigned M0Bank =
5501+
getRegBankID(MI.getOperand(5).getReg(), MRI, AMDGPU::SGPRRegBankID);
5502+
OpdsMapping[5] = AMDGPU::getValueMapping(M0Bank, 32);
5503+
break;
5504+
}
54865505
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
54875506
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
54885507
case Intrinsic::amdgcn_global_store_async_from_lds_b64:

0 commit comments

Comments
 (0)