Skip to content

Commit 0461cd3

Browse files
authored
[AMDGPU] Intrinsic for launching whole wave functions (llvm#145859)
Add the llvm.amdgcn.call.whole.wave intrinsic for calling whole wave functions. This will take as its first argument the callee with the amdgpu_gfx_whole_wave calling convention, followed by the call parameters, which must match the signature of the callee except for the first function argument (the i1 original EXEC mask, which doesn't need to be passed in). Indirect calls are not allowed. Make direct calls to amdgpu_gfx_whole_wave functions a verifier error. Unspeakable horrors happen around calls from whole wave functions; the plan is to improve the handling of caller/callee-saved registers in a future patch. Tail calls are also handled in a future patch.
1 parent ff5fa71 commit 0461cd3

File tree

10 files changed

+1842
-3
lines changed

10 files changed

+1842
-3
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2671,6 +2671,18 @@ def int_amdgcn_cs_chain:
26712671
],
26722672
[IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;
26732673

2674+
// Run a function with all the lanes enabled. Only direct calls are allowed. The
2675+
// first argument is the callee, which must have the `amdgpu_gfx_whole_wave`
2676+
// calling convention and must not be variadic. The remaining arguments to the
2677+
// callee are taken from the arguments passed to the intrinsic. Lanes that are
2678+
// inactive at the point of the call will receive poison. The return value is
2679+
// the return value of the callee for the active lanes (there is no return
2680+
// value in the inactive ones).
2681+
def int_amdgcn_call_whole_wave:
2682+
Intrinsic<[llvm_any_ty], // The return type of the callee.
2683+
[llvm_anyptr_ty, // The callee.
2684+
llvm_vararg_ty], // The arguments to the callee.
2685+
[IntrConvergent]>;
26742686

26752687
//===----------------------------------------------------------------------===//
26762688
// CI+ Intrinsics

llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2556,6 +2556,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
25562556
getOrCreateVReg(*ConstantInt::getTrue(CI.getType())));
25572557
return true;
25582558
case Intrinsic::amdgcn_cs_chain:
2559+
case Intrinsic::amdgcn_call_whole_wave:
25592560
return translateCallBase(CI, MIRBuilder);
25602561
case Intrinsic::fptrunc_round: {
25612562
uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7984,6 +7984,43 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
79847984
HasTailCall = true;
79857985
return;
79867986
}
7987+
case Intrinsic::amdgcn_call_whole_wave: {
7988+
TargetLowering::ArgListTy Args;
7989+
7990+
// The first argument is the callee. Skip it when assembling the call args.
7991+
TargetLowering::ArgListEntry Arg;
7992+
for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) {
7993+
Arg.Node = getValue(I.getArgOperand(Idx));
7994+
Arg.Ty = I.getArgOperand(Idx)->getType();
7995+
Arg.setAttributes(&I, Idx);
7996+
Args.push_back(Arg);
7997+
}
7998+
7999+
SDValue ConvControlToken;
8000+
if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
8001+
auto *Token = Bundle->Inputs[0].get();
8002+
ConvControlToken = getValue(Token);
8003+
}
8004+
8005+
TargetLowering::CallLoweringInfo CLI(DAG);
8006+
CLI.setDebugLoc(getCurSDLoc())
8007+
.setChain(getRoot())
8008+
.setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
8009+
getValue(I.getArgOperand(0)), std::move(Args))
8010+
.setTailCall(false)
8011+
.setIsPreallocated(
8012+
I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
8013+
.setConvergent(I.isConvergent())
8014+
.setConvergenceControlToken(ConvControlToken);
8015+
CLI.CB = &I;
8016+
8017+
std::pair<SDValue, SDValue> Result =
8018+
lowerInvokable(CLI, /*EHPadBB=*/nullptr);
8019+
8020+
if (Result.first.getNode())
8021+
setValue(&I, Result.first);
8022+
return;
8023+
}
79878024
case Intrinsic::ptrmask: {
79888025
SDValue Ptr = getValue(I.getOperand(0));
79898026
SDValue Mask = getValue(I.getOperand(1));

llvm/lib/IR/Verifier.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6612,6 +6612,36 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
66126612
"Value for inactive lanes must be a VGPR function argument", &Call);
66136613
break;
66146614
}
6615+
case Intrinsic::amdgcn_call_whole_wave: {
6616+
auto F = dyn_cast<Function>(Call.getArgOperand(0));
6617+
Check(F, "Indirect whole wave calls are not allowed", &Call);
6618+
6619+
CallingConv::ID CC = F->getCallingConv();
6620+
Check(CC == CallingConv::AMDGPU_Gfx_WholeWave,
6621+
"Callee must have the amdgpu_gfx_whole_wave calling convention",
6622+
&Call);
6623+
6624+
Check(!F->isVarArg(), "Variadic whole wave calls are not allowed", &Call);
6625+
6626+
Check(Call.arg_size() == F->arg_size(),
6627+
"Call argument count must match callee argument count", &Call);
6628+
6629+
// The first argument of the call is the callee, and the first argument of
6630+
// the callee is the active mask. The rest of the arguments must match.
6631+
Check(F->arg_begin()->getType()->isIntegerTy(1),
6632+
"Callee must have i1 as its first argument", &Call);
6633+
for (auto [CallArg, FuncArg] :
6634+
drop_begin(zip_equal(Call.args(), F->args()))) {
6635+
Check(CallArg->getType() == FuncArg.getType(),
6636+
"Argument types must match", &Call);
6637+
6638+
// Check that inreg attributes match between call site and function
6639+
Check(Call.paramHasAttr(FuncArg.getArgNo(), Attribute::InReg) ==
6640+
FuncArg.hasInRegAttr(),
6641+
"Argument inreg attributes must match", &Call);
6642+
}
6643+
break;
6644+
}
66156645
case Intrinsic::amdgcn_s_prefetch_data: {
66166646
Check(
66176647
AMDGPU::isFlatGlobalAddrSpace(

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1464,9 +1464,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
14641464
CallLoweringInfo &Info) const {
14651465
if (Function *F = Info.CB->getCalledFunction())
14661466
if (F->isIntrinsic()) {
1467-
assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
1468-
"Unexpected intrinsic");
1469-
return lowerChainCall(MIRBuilder, Info);
1467+
switch (F->getIntrinsicID()) {
1468+
case Intrinsic::amdgcn_cs_chain:
1469+
return lowerChainCall(MIRBuilder, Info);
1470+
case Intrinsic::amdgcn_call_whole_wave:
1471+
Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;
1472+
1473+
// Get the callee from the original instruction, so it doesn't look like
1474+
// this is an indirect call.
1475+
Info.Callee = MachineOperand::CreateGA(
1476+
cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0);
1477+
Info.OrigArgs.erase(Info.OrigArgs.begin());
1478+
Info.IsVarArg = false;
1479+
break;
1480+
default:
1481+
llvm_unreachable("Unexpected intrinsic call");
1482+
}
14701483
}
14711484

14721485
if (Info.IsVarArg) {
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=DAGISEL
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GISEL
4+
5+
declare amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 %y, i32 inreg %c)
6+
7+
define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) {
8+
; DAGISEL-LABEL: basic_test:
9+
; DAGISEL: ; %bb.0:
10+
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
11+
; DAGISEL-NEXT: s_wait_expcnt 0x0
12+
; DAGISEL-NEXT: s_wait_samplecnt 0x0
13+
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
14+
; DAGISEL-NEXT: s_wait_kmcnt 0x0
15+
; DAGISEL-NEXT: s_mov_b32 s0, s33
16+
; DAGISEL-NEXT: s_mov_b32 s33, s32
17+
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
18+
; DAGISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
19+
; DAGISEL-NEXT: s_wait_alu 0xfffe
20+
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
21+
; DAGISEL-NEXT: v_writelane_b32 v42, s0, 2
22+
; DAGISEL-NEXT: s_clause 0x1
23+
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
24+
; DAGISEL-NEXT: scratch_store_b32 off, v41, s33
25+
; DAGISEL-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
26+
; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
27+
; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0
28+
; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
29+
; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
30+
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
31+
; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1
32+
; DAGISEL-NEXT: s_wait_alu 0xfffe
33+
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
34+
; DAGISEL-NEXT: global_store_b32 v[40:41], v0, off
35+
; DAGISEL-NEXT: s_clause 0x1
36+
; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
37+
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
38+
; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
39+
; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
40+
; DAGISEL-NEXT: s_mov_b32 s32, s33
41+
; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
42+
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
43+
; DAGISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
44+
; DAGISEL-NEXT: s_wait_alu 0xfffe
45+
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
46+
; DAGISEL-NEXT: s_mov_b32 s33, s0
47+
; DAGISEL-NEXT: s_wait_loadcnt 0x0
48+
; DAGISEL-NEXT: s_wait_alu 0xfffe
49+
; DAGISEL-NEXT: s_setpc_b64 s[30:31]
50+
;
51+
; GISEL-LABEL: basic_test:
52+
; GISEL: ; %bb.0:
53+
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
54+
; GISEL-NEXT: s_wait_expcnt 0x0
55+
; GISEL-NEXT: s_wait_samplecnt 0x0
56+
; GISEL-NEXT: s_wait_bvhcnt 0x0
57+
; GISEL-NEXT: s_wait_kmcnt 0x0
58+
; GISEL-NEXT: s_mov_b32 s0, s33
59+
; GISEL-NEXT: s_mov_b32 s33, s32
60+
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
61+
; GISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
62+
; GISEL-NEXT: s_wait_alu 0xfffe
63+
; GISEL-NEXT: s_mov_b32 exec_lo, s1
64+
; GISEL-NEXT: v_writelane_b32 v42, s0, 2
65+
; GISEL-NEXT: s_clause 0x1
66+
; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
67+
; GISEL-NEXT: scratch_store_b32 off, v41, s33
68+
; GISEL-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_mov_b32 v41, v2
69+
; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
70+
; GISEL-NEXT: v_writelane_b32 v42, s30, 0
71+
; GISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
72+
; GISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
73+
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
74+
; GISEL-NEXT: v_writelane_b32 v42, s31, 1
75+
; GISEL-NEXT: s_wait_alu 0xfffe
76+
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
77+
; GISEL-NEXT: global_store_b32 v[40:41], v0, off
78+
; GISEL-NEXT: s_clause 0x1
79+
; GISEL-NEXT: scratch_load_b32 v41, off, s33
80+
; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
81+
; GISEL-NEXT: v_readlane_b32 s31, v42, 1
82+
; GISEL-NEXT: v_readlane_b32 s30, v42, 0
83+
; GISEL-NEXT: s_mov_b32 s32, s33
84+
; GISEL-NEXT: v_readlane_b32 s0, v42, 2
85+
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
86+
; GISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
87+
; GISEL-NEXT: s_wait_alu 0xfffe
88+
; GISEL-NEXT: s_mov_b32 exec_lo, s1
89+
; GISEL-NEXT: s_mov_b32 s33, s0
90+
; GISEL-NEXT: s_wait_loadcnt 0x0
91+
; GISEL-NEXT: s_wait_alu 0xfffe
92+
; GISEL-NEXT: s_setpc_b64 s[30:31]
93+
%y = add i32 %x, 13
94+
%ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
95+
store i32 %ret, ptr addrspace(1) %ptr
96+
ret void
97+
}
98+
99+
declare amdgpu_gfx_whole_wave void @void_callee(i1 %active, i32 %x)
100+
101+
define amdgpu_gfx void @ret_void(i32 %x) {
102+
; DAGISEL-LABEL: ret_void:
103+
; DAGISEL: ; %bb.0:
104+
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
105+
; DAGISEL-NEXT: s_wait_expcnt 0x0
106+
; DAGISEL-NEXT: s_wait_samplecnt 0x0
107+
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
108+
; DAGISEL-NEXT: s_wait_kmcnt 0x0
109+
; DAGISEL-NEXT: s_mov_b32 s0, s33
110+
; DAGISEL-NEXT: s_mov_b32 s33, s32
111+
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
112+
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
113+
; DAGISEL-NEXT: s_wait_alu 0xfffe
114+
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
115+
; DAGISEL-NEXT: v_writelane_b32 v40, s0, 2
116+
; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
117+
; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
118+
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
119+
; DAGISEL-NEXT: v_writelane_b32 v40, s30, 0
120+
; DAGISEL-NEXT: v_writelane_b32 v40, s31, 1
121+
; DAGISEL-NEXT: s_wait_alu 0xfffe
122+
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
123+
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
124+
; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
125+
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
126+
; DAGISEL-NEXT: s_mov_b32 s32, s33
127+
; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
128+
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
129+
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
130+
; DAGISEL-NEXT: s_wait_alu 0xfffe
131+
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
132+
; DAGISEL-NEXT: s_mov_b32 s33, s0
133+
; DAGISEL-NEXT: s_wait_loadcnt 0x0
134+
; DAGISEL-NEXT: s_wait_alu 0xfffe
135+
; DAGISEL-NEXT: s_setpc_b64 s[30:31]
136+
;
137+
; GISEL-LABEL: ret_void:
138+
; GISEL: ; %bb.0:
139+
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
140+
; GISEL-NEXT: s_wait_expcnt 0x0
141+
; GISEL-NEXT: s_wait_samplecnt 0x0
142+
; GISEL-NEXT: s_wait_bvhcnt 0x0
143+
; GISEL-NEXT: s_wait_kmcnt 0x0
144+
; GISEL-NEXT: s_mov_b32 s0, s33
145+
; GISEL-NEXT: s_mov_b32 s33, s32
146+
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
147+
; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
148+
; GISEL-NEXT: s_wait_alu 0xfffe
149+
; GISEL-NEXT: s_mov_b32 exec_lo, s1
150+
; GISEL-NEXT: v_writelane_b32 v40, s0, 2
151+
; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
152+
; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
153+
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
154+
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
155+
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
156+
; GISEL-NEXT: s_wait_alu 0xfffe
157+
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
158+
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
159+
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
160+
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
161+
; GISEL-NEXT: s_mov_b32 s32, s33
162+
; GISEL-NEXT: v_readlane_b32 s0, v40, 2
163+
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
164+
; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
165+
; GISEL-NEXT: s_wait_alu 0xfffe
166+
; GISEL-NEXT: s_mov_b32 exec_lo, s1
167+
; GISEL-NEXT: s_mov_b32 s33, s0
168+
; GISEL-NEXT: s_wait_loadcnt 0x0
169+
; GISEL-NEXT: s_wait_alu 0xfffe
170+
; GISEL-NEXT: s_setpc_b64 s[30:31]
171+
call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @void_callee, i32 %x)
172+
ret void
173+
}
174+

llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,29 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
101101
%ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
102102
ret i64 %ret
103103
}
104+
105+
declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, i32 %x)
106+
107+
; Make sure we don't pass the first argument (i1).
108+
define amdgpu_cs void @call(i32 %x, ptr %p) {
109+
; CHECK-LABEL: name: call
110+
; CHECK: bb.1 (%ir-block.0):
111+
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
112+
; CHECK-NEXT: {{ $}}
113+
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
114+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
115+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
116+
; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
117+
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
118+
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
119+
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
120+
; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
121+
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV1]](p0), @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0
122+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
123+
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
124+
; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p0) :: (store (s32) into %ir.p)
125+
; CHECK-NEXT: S_ENDPGM 0
126+
%ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, i32 %x) convergent
127+
store i32 %ret, ptr %p
128+
ret void
129+
}

0 commit comments

Comments
 (0)