
Commit aaca255

Author: git apple-llvm automerger

Merge commit '0461cd3d1d6f' from llvm.org/main into next

2 parents 6ce64d4 + 0461cd3, commit aaca255

File tree

10 files changed (+1842, -3 lines)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 12 additions & 0 deletions
@@ -2671,6 +2671,18 @@ def int_amdgcn_cs_chain:
       ],
       [IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;
 
+// Run a function with all the lanes enabled. Only direct calls are allowed. The
+// first argument is the callee, which must have the `amdgpu_gfx_whole_wave`
+// calling convention and must not be variadic. The remaining arguments to the
+// callee are taken from the arguments passed to the intrinsic. Lanes that are
+// inactive at the point of the call will receive poison. The return value is
+// the return value of the callee for the active lanes (there is no return
+// value in the inactive ones).
+def int_amdgcn_call_whole_wave:
+  Intrinsic<[llvm_any_ty],    // The return type of the callee.
+            [llvm_anyptr_ty,  // The callee.
+             llvm_vararg_ty], // The arguments to the callee.
+            [IntrConvergent]>;
 
 //===----------------------------------------------------------------------===//
 // CI+ Intrinsics
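
For reference, a minimal LLVM IR sketch of the intended usage (function names are illustrative; the pattern follows the tests added later in this commit). The callee uses the amdgpu_gfx_whole_wave calling convention and receives the i1 active-lane mask as its first parameter, while the call site passes only the remaining arguments after the callee pointer:

; Usage sketch (illustrative names); the i1 %active parameter is supplied by
; the calling convention, not by the call site.
declare amdgpu_gfx_whole_wave i32 @ww_callee(i1 %active, i32 %x)

define amdgpu_gfx i32 @ww_caller(i32 %x) {
  ; Only direct calls are allowed: the first operand must be the callee symbol.
  %r = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @ww_callee, i32 %x)
  ret i32 %r
}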

llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp

Lines changed: 1 addition & 0 deletions
@@ -2570,6 +2570,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
                    getOrCreateVReg(*ConstantInt::getTrue(CI.getType())));
     return true;
   case Intrinsic::amdgcn_cs_chain:
+  case Intrinsic::amdgcn_call_whole_wave:
     return translateCallBase(CI, MIRBuilder);
   case Intrinsic::fptrunc_round: {
     uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 37 additions & 0 deletions
@@ -8016,6 +8016,43 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     HasTailCall = true;
     return;
   }
+  case Intrinsic::amdgcn_call_whole_wave: {
+    TargetLowering::ArgListTy Args;
+
+    // The first argument is the callee. Skip it when assembling the call args.
+    TargetLowering::ArgListEntry Arg;
+    for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) {
+      Arg.Node = getValue(I.getArgOperand(Idx));
+      Arg.Ty = I.getArgOperand(Idx)->getType();
+      Arg.setAttributes(&I, Idx);
+      Args.push_back(Arg);
+    }
+
+    SDValue ConvControlToken;
+    if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+      auto *Token = Bundle->Inputs[0].get();
+      ConvControlToken = getValue(Token);
+    }
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(getCurSDLoc())
+        .setChain(getRoot())
+        .setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
+                   getValue(I.getArgOperand(0)), std::move(Args))
+        .setTailCall(false)
+        .setIsPreallocated(
+            I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
+        .setConvergent(I.isConvergent())
+        .setConvergenceControlToken(ConvControlToken);
+    CLI.CB = &I;
+
+    std::pair<SDValue, SDValue> Result =
+        lowerInvokable(CLI, /*EHPadBB=*/nullptr);
+
+    if (Result.first.getNode())
+      setValue(&I, Result.first);
+    return;
+  }
   case Intrinsic::ptrmask: {
     SDValue Ptr = getValue(I.getOperand(0));
     SDValue Mask = getValue(I.getOperand(1));
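
The bundle lookup above forwards an explicit convergence token when the call site carries one. The tests in this commit only use the bare convergent attribute, so the following is a hedged sketch (illustrative names; it assumes the standard llvm.experimental.convergence.anchor intrinsic) of what a token-carrying call site could look like:

; Sketch only, not taken from this commit's tests.
declare amdgpu_gfx_whole_wave void @ww_callee(i1 %active, i32 %x)

define amdgpu_gfx void @caller_with_token(i32 %x) {
  %tok = call token @llvm.experimental.convergence.anchor()
  call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @ww_callee, i32 %x) [ "convergencectrl"(token %tok) ]
  ret void
}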

llvm/lib/IR/Verifier.cpp

Lines changed: 30 additions & 0 deletions
@@ -6716,6 +6716,36 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
         "Value for inactive lanes must be a VGPR function argument", &Call);
     break;
   }
+  case Intrinsic::amdgcn_call_whole_wave: {
+    auto F = dyn_cast<Function>(Call.getArgOperand(0));
+    Check(F, "Indirect whole wave calls are not allowed", &Call);
+
+    CallingConv::ID CC = F->getCallingConv();
+    Check(CC == CallingConv::AMDGPU_Gfx_WholeWave,
+          "Callee must have the amdgpu_gfx_whole_wave calling convention",
+          &Call);
+
+    Check(!F->isVarArg(), "Variadic whole wave calls are not allowed", &Call);
+
+    Check(Call.arg_size() == F->arg_size(),
+          "Call argument count must match callee argument count", &Call);
+
+    // The first argument of the call is the callee, and the first argument of
+    // the callee is the active mask. The rest of the arguments must match.
+    Check(F->arg_begin()->getType()->isIntegerTy(1),
+          "Callee must have i1 as its first argument", &Call);
+    for (auto [CallArg, FuncArg] :
+         drop_begin(zip_equal(Call.args(), F->args()))) {
+      Check(CallArg->getType() == FuncArg.getType(),
+            "Argument types must match", &Call);
+
+      // Check that inreg attributes match between call site and function
+      Check(Call.paramHasAttr(FuncArg.getArgNo(), Attribute::InReg) ==
+                FuncArg.hasInRegAttr(),
+            "Argument inreg attributes must match", &Call);
+    }
+    break;
+  }
   case Intrinsic::amdgcn_s_prefetch_data: {
     Check(
         AMDGPU::isFlatGlobalAddrSpace(
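
As an illustration of what these checks catch, here is a sketch (hypothetical names) of IR the verifier now rejects: the callee below uses the plain amdgpu_gfx convention, so verification fails with "Callee must have the amdgpu_gfx_whole_wave calling convention"; passing a first operand that is not a function symbol would instead trip the "Indirect whole wave calls are not allowed" check.

; Rejected: wrong calling convention on the callee (names are hypothetical).
declare amdgpu_gfx i32 @bad_callee(i1 %active, i32 %x)

define amdgpu_gfx void @bad_caller(i32 %x) {
  %r = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @bad_callee, i32 %x)
  ret void
}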

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 16 additions & 3 deletions
@@ -1464,9 +1464,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                    CallLoweringInfo &Info) const {
   if (Function *F = Info.CB->getCalledFunction())
     if (F->isIntrinsic()) {
-      assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
-             "Unexpected intrinsic");
-      return lowerChainCall(MIRBuilder, Info);
+      switch (F->getIntrinsicID()) {
+      case Intrinsic::amdgcn_cs_chain:
+        return lowerChainCall(MIRBuilder, Info);
+      case Intrinsic::amdgcn_call_whole_wave:
+        Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;
+
+        // Get the callee from the original instruction, so it doesn't look like
+        // this is an indirect call.
+        Info.Callee = MachineOperand::CreateGA(
+            cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0);
+        Info.OrigArgs.erase(Info.OrigArgs.begin());
+        Info.IsVarArg = false;
+        break;
+      default:
+        llvm_unreachable("Unexpected intrinsic call");
+      }
     }
 
   if (Info.IsVarArg) {
Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=DAGISEL
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GISEL
+
+declare amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 %y, i32 inreg %c)
+
+define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) {
+; DAGISEL-LABEL: basic_test:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: v_writelane_b32 v42, s0, 2
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v41, s33
+; DAGISEL-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
+; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
+; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0
+; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
+; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
+; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: global_store_b32 v[40:41], v0, off
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
+; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: basic_test:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: v_writelane_b32 v42, s0, 2
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v41, s33
+; GISEL-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_mov_b32 v41, v2
+; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
+; GISEL-NEXT: v_writelane_b32 v42, s30, 0
+; GISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
+; GISEL-NEXT: s_add_co_i32 s32, s32, 16
+; GISEL-NEXT: v_writelane_b32 v42, s31, 1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: global_store_b32 v[40:41], v0, off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v41, off, s33
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GISEL-NEXT: v_readlane_b32 s31, v42, 1
+; GISEL-NEXT: v_readlane_b32 s30, v42, 0
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: v_readlane_b32 s0, v42, 2
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+  %y = add i32 %x, 13
+  %ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
+  store i32 %ret, ptr addrspace(1) %ptr
+  ret void
+}
+
+declare amdgpu_gfx_whole_wave void @void_callee(i1 %active, i32 %x)
+
+define amdgpu_gfx void @ret_void(i32 %x) {
+; DAGISEL-LABEL: ret_void:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: v_writelane_b32 v40, s0, 2
+; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
+; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
+; DAGISEL-NEXT: v_writelane_b32 v40, s30, 0
+; DAGISEL-NEXT: v_writelane_b32 v40, s31, 1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
+; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: ret_void:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: v_writelane_b32 v40, s0, 2
+; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
+; GISEL-NEXT: s_add_co_i32 s32, s32, 16
+; GISEL-NEXT: v_writelane_b32 v40, s30, 0
+; GISEL-NEXT: v_writelane_b32 v40, s31, 1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_readlane_b32 s31, v40, 1
+; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: v_readlane_b32 s0, v40, 2
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+  call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @void_callee, i32 %x)
+  ret void
+}
+

llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll

Lines changed: 26 additions & 0 deletions
@@ -101,3 +101,29 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
   %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
   ret i64 %ret
 }
+
+declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, i32 %x)
+
+; Make sure we don't pass the first argument (i1).
+define amdgpu_cs void @call(i32 %x, ptr %p) {
+  ; CHECK-LABEL: name: call
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
+  ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
+  ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
+  ; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV1]](p0), @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0
+  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p0) :: (store (s32) into %ir.p)
+  ; CHECK-NEXT: S_ENDPGM 0
+  %ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, i32 %x) convergent
+  store i32 %ret, ptr %p
+  ret void
+}
