Skip to content

Commit d3d1d8f

Browse files
authored
[AMDGPU] Support cluster load instructions for gfx1250 (#156548)
1 parent 0dc1b16 commit d3d1d8f

File tree

16 files changed

+503
-0
lines changed

16 files changed

+503
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,9 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "g
667667
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts")
668668
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts")
669669
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts")
670+
// Cluster load builtins for 32-, 64- and 128-bit results. Each takes an
// address-space-1 pointer, a cache-policy argument that must be a constant
// integer, and an int mask, and returns the loaded value. In addition to
// gfx1250-insts they require the wavefrontsize32 feature.
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b32, "ii*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
671+
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b64, "V2iV2i*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
672+
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b128, "V4iV4i*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
670673
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
671674
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
672675
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,27 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
675675
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
676676
return Builder.CreateCall(F, {Addr, Val});
677677
}
678+
// Lower the three cluster load builtins to the matching
// llvm.amdgcn.cluster.load.* intrinsic, forwarding every call argument
// unchanged and overloading the intrinsic on the builtin's return type.
case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
679+
case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
680+
case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: {
681+
// IID is assigned by the inner switch below; the inner switch has no default
// because the enclosing case labels admit only these three builtin IDs.
Intrinsic::ID IID;
682+
switch (BuiltinID) {
683+
case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
684+
IID = Intrinsic::amdgcn_cluster_load_b32;
685+
break;
686+
case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
687+
IID = Intrinsic::amdgcn_cluster_load_b64;
688+
break;
689+
case AMDGPU::BI__builtin_amdgcn_cluster_load_b128:
690+
IID = Intrinsic::amdgcn_cluster_load_b128;
691+
break;
692+
}
693+
// Emit each argument of the call expression as a scalar, in source order.
SmallVector<Value *, 3> Args;
694+
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
695+
Args.push_back(EmitScalarExpr(E->getArg(i)));
696+
// The intrinsic's single overloaded type is the converted result type of the
// builtin call.
llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())});
697+
return Builder.CreateCall(F, {Args});
698+
}
678699
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
679700
// Should this have asan instrumentation?
680701
return emitBuiltinWithOneOverloadedType<5>(*this, E,
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
4+
5+
typedef int v2i __attribute__((ext_vector_type(2)));
6+
typedef int v4i __attribute__((ext_vector_type(4)));
7+
8+
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b32(
9+
// CHECK-GFX1250-NEXT: entry:
10+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.cluster.load.b32.i32(ptr addrspace(1) [[INPTR:%.*]], i32 10, i32 [[MASK:%.*]])
11+
// CHECK-GFX1250-NEXT: ret i32 [[TMP0]]
12+
//
13+
// Calls the b32 cluster load builtin with a literal cache-policy value of 10
// and a runtime mask, returning the loaded int.
int test_amdgcn_cluster_load_b32(global int* inptr, int mask)
14+
{
15+
return __builtin_amdgcn_cluster_load_b32(inptr, 10, mask);
16+
}
17+
18+
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b64(
19+
// CHECK-GFX1250-NEXT: entry:
20+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]], i32 22, i32 [[MASK:%.*]])
21+
// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
22+
//
23+
// Calls the b64 cluster load builtin with a literal cache-policy value of 22
// and a runtime mask, returning the loaded 2 x int vector.
v2i test_amdgcn_cluster_load_b64(global v2i* inptr, int mask)
24+
{
25+
return __builtin_amdgcn_cluster_load_b64(inptr, 22, mask);
26+
}
27+
28+
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b128(
29+
// CHECK-GFX1250-NEXT: entry:
30+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32(ptr addrspace(1) [[INPTR:%.*]], i32 27, i32 [[MASK:%.*]])
31+
// CHECK-GFX1250-NEXT: ret <4 x i32> [[TMP0]]
32+
//
33+
// Calls the b128 cluster load builtin with a literal cache-policy value of 27
// and a runtime mask, returning the loaded 4 x int vector.
v4i test_amdgcn_cluster_load_b128(global v4i* inptr, int mask)
34+
{
35+
return __builtin_amdgcn_cluster_load_b128(inptr, 27, mask);
36+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,13 @@ void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global
103103
*b128out = __builtin_amdgcn_flat_load_monitor_b128(b128faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b128' must be a constant integer}}
104104
}
105105

106+
// Negative test: passes the non-constant parameter `cpol` as the second
// argument of each cluster load builtin. Every call must be rejected because
// that argument has to be a constant integer; the runtime `mask` argument is
// not what is being diagnosed here.
void test_amdgcn_cluster_load(global int* addr32, global v2i* addr64, global v4i* addr128, global int* b32out, global v2i* b64out, global v4i* b128out, int cpol, int mask)
107+
{
108+
*b32out = __builtin_amdgcn_cluster_load_b32(addr32, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b32' must be a constant integer}}
109+
*b64out = __builtin_amdgcn_cluster_load_b64(addr64, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b64' must be a constant integer}}
110+
*b128out = __builtin_amdgcn_cluster_load_b128(addr128, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b128' must be a constant integer}}
111+
}
112+
106113
void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8,
107114
local int *laddr32, local v2i* laddr64, local v4i* laddr128, int offset, int mask)
108115
{

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4113,6 +4113,23 @@ def int_amdgcn_tensor_load_to_lds_d2 :
41134113
def int_amdgcn_tensor_store_from_lds_d2 :
41144114
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;
41154115

4116+
// Common shape of the cluster load intrinsics, parameterised on the pointer
// type. The result type is overloaded (llvm_any_ty). Operands are the pointer
// to load from, an i32 cache policy that must be an immediate (ImmArg on
// operand 1), and an i32 workgroup broadcast mask. The intrinsic only reads
// memory, only through its pointer argument, and is marked convergent.
class AMDGPUClusterLoad<LLVMType ptr_ty>:
4117+
Intrinsic<
4118+
[llvm_any_ty],
4119+
[ptr_ty,
4120+
llvm_i32_ty, // gfx12+ cachepolicy:
4121+
// bits [0-2] = th
4122+
// bits [3-4] = scope
4123+
llvm_i32_ty], // workgroup broadcast mask (in M0)
4124+
[IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, IntrWillReturn, IntrConvergent,
4125+
NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree],
4126+
"", [SDNPMemOperand]
4127+
>;
4128+
4129+
// The three cluster load intrinsics (b32, b64, b128). All are instantiated
// with the global pointer type; the loaded value's type comes from the
// overloaded result of AMDGPUClusterLoad.
def int_amdgcn_cluster_load_b32 : AMDGPUClusterLoad<global_ptr_ty>;
4130+
def int_amdgcn_cluster_load_b64 : AMDGPUClusterLoad<global_ptr_ty>;
4131+
def int_amdgcn_cluster_load_b128 : AMDGPUClusterLoad<global_ptr_ty>;
4132+
41164133
class AMDGPULoadMonitor<LLVMType ptr_ty>:
41174134
Intrinsic<
41184135
[llvm_any_ty],

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@ def gi_global_saddr :
128128
def gi_global_saddr_cpol :
129129
GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
130130
GIComplexPatternEquiv<GlobalSAddrCPol>;
131+
// GlobalISel counterpart of the GlobalSAddrCPolM0 complex pattern: matches an
// s64 operand using AMDGPUInstructionSelector::selectGlobalSAddrCPolM0.
def gi_global_saddr_cpol_m0 :
132+
GIComplexOperandMatcher<s64, "selectGlobalSAddrCPolM0">,
133+
GIComplexPatternEquiv<GlobalSAddrCPolM0>;
131134
def gi_global_saddr_glc :
132135
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
133136
GIComplexPatternEquiv<GlobalSAddrGLC>;

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2089,6 +2089,23 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
20892089
return true;
20902090
}
20912091

2092+
// Complex-pattern selector that matches a global SAddr address and also
// produces the cache-policy operand. Address matching is delegated to
// SelectGlobalSAddr; if that fails this returns false and CPol is not set.
// On success SAddr, VOffset and Offset are whatever SelectGlobalSAddr
// produced, and CPol is an i32 target constant built from the cache-policy
// immediate carried by node N.
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2093+
SDValue &SAddr,
2094+
SDValue &VOffset,
2095+
SDValue &Offset,
2096+
SDValue &CPol) const {
2097+
// Written by SelectGlobalSAddr; only read after that call has succeeded.
bool ScaleOffset;
2098+
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2099+
return false;
2100+
2101+
// We are assuming CPol is the second-to-last operand of the intrinsic.
// NOTE(review): presumably the last operand is the broadcast mask that goes
// in M0 — confirm against the intrinsic definition.
// Any SCAL bit supplied by the caller is cleared here; SCAL is set again
// below only when the address match reported a scaled offset.
2102+
auto PassedCPol =
2103+
N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2104+
CPol = CurDAG->getTargetConstant(
2105+
(ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2106+
return true;
2107+
}
2108+
20922109
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
20932110
SDValue &SAddr, SDValue &VOffset,
20942111
SDValue &Offset,

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
171171
bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
172172
SDValue &VOffset, SDValue &Offset,
173173
SDValue &CPol) const;
174+
// Like SelectGlobalSAddrCPol, but reads the cache-policy immediate from the
// second-to-last operand of N rather than the last one.
// NOTE(review): presumably because the last operand is the M0 mask — confirm.
bool SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr, SDValue &SAddr,
175+
SDValue &VOffset, SDValue &Offset,
176+
SDValue &CPol) const;
174177
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
175178
SDValue &VOffset, SDValue &Offset,
176179
SDValue &CPol) const;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5708,6 +5708,16 @@ AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
57085708
return selectGlobalSAddr(Root, PassedCPol);
57095709
}
57105710

5711+
// GlobalISel complex-operand matcher for a global SAddr address plus cache
// policy. It reads the cache-policy immediate from the instruction that owns
// Root, clears the SCAL bit, and hands the result to selectGlobalSAddr, whose
// return value is returned unchanged.
InstructionSelector::ComplexRendererFns
5712+
AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
5713+
const MachineInstr &I = *Root.getParent();
5714+
5715+
// We are assuming CPol is the second-to-last operand of the intrinsic.
// NOTE(review): presumably the last operand is the broadcast mask that goes
// in M0 — confirm against the intrinsic definition.
5716+
auto PassedCPol =
5717+
I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5718+
return selectGlobalSAddr(Root, PassedCPol);
5719+
}
5720+
57115721
InstructionSelector::ComplexRendererFns
57125722
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
57135723
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
256256
InstructionSelector::ComplexRendererFns
257257
selectGlobalSAddrCPol(MachineOperand &Root) const;
258258
// Variant of selectGlobalSAddrCPol that takes the cache-policy immediate from
// the second-to-last operand of the instruction owning Root.
InstructionSelector::ComplexRendererFns
259+
selectGlobalSAddrCPolM0(MachineOperand &Root) const;
260+
InstructionSelector::ComplexRendererFns
259261
selectGlobalSAddrGLC(MachineOperand &Root) const;
260262
InstructionSelector::ComplexRendererFns
261263
selectGlobalSAddrNoIOffset(MachineOperand &Root) const;

0 commit comments

Comments
 (0)