Skip to content

Commit b124f68

Browse files
krzysz00 and arsenm
authored and committed
[AMDGPU] Enable volatile and non-temporal for loads to LDS (llvm#153244)
The primary purpose of this commit is to enable marking loads to LDS (global.load.lds, buffer.*.load.lds) volatile (using bit 31 of the aux as with normal buffer loads) and to ensure that their !nontemporal annotations translate to appropriate settings of te cache control bits. However, in the process of implementing this feature, we also fixed - Incorrect handling of buffer loads to LDS in GlobalISel - Updating the handling of volatile on buffers in SIMemoryLegalizer: previously, the mapping of address spaces would cause volatile on buffer loads to be silently dropped on at least gfx10. --------- Co-authored-by: Matt Arsenault <[email protected]>
1 parent 4b81d70 commit b124f68

13 files changed

+889
-48
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6512,6 +6512,13 @@ operations.
65126512
``buffer/global/flat_load/store/atomic`` instructions to global memory are
65136513
termed vector memory operations.
65146514

6515+
``global_load_lds`` or ``buffer/global_load`` instructions with the `lds` flag
6516+
are LDS DMA loads. They interact with caches as if the loaded data were
6517+
being loaded to registers and not to LDS, and so therefore support the same
6518+
cache modifiers. They cannot be performed atomically. They implement volatile
6519+
(via aux/cpol bit 31) and nontemporal (via metadata) as if they were loads
6520+
from the global address space.
6521+
65156522
Private address space uses ``buffer_load/store`` using the scratch V#
65166523
(GFX6-GFX8), or ``scratch_load/store`` (GFX9-GFX11). Since only a single thread
65176524
is accessing the memory, atomic memory orderings are not meaningful, and all

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2819,20 +2819,24 @@ class AMDGPULoadToLDS :
28192819
"", [SDNPMemOperand]>;
28202820
def int_amdgcn_load_to_lds : AMDGPULoadToLDS;
28212821

2822-
class AMDGPUGlobalLoadLDS :
2823-
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
2824-
Intrinsic <
2825-
[],
2826-
[LLVMQualPointerType<1>, // Base global pointer to load from
2827-
LLVMQualPointerType<3>, // LDS base pointer to store to
2828-
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
2829-
llvm_i32_ty, // imm offset (applied to both global and LDS address)
2830-
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
2831-
// bit 1 = sc1,
2832-
// bit 4 = scc))
2833-
[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
2834-
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
2835-
"", [SDNPMemOperand]>;
2822+
class AMDGPUGlobalLoadLDS
2823+
: ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
2824+
Intrinsic<
2825+
[],
2826+
[LLVMQualPointerType<1>, // Base global pointer to load from
2827+
LLVMQualPointerType<3>, // LDS base pointer to store to
2828+
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
2829+
llvm_i32_ty, // imm offset (applied to both global and LDS address)
2830+
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
2831+
// bit 1 = sc1,
2832+
// bit 4 = scc,
2833+
// bit 31 = volatile
2834+
// (compiler
2835+
// implemented)))
2836+
[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
2837+
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
2838+
IntrNoCallback, IntrNoFree],
2839+
"", [SDNPMemOperand]>;
28362840
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
28372841

28382842
// This is IntrHasSideEffects because it reads from a volatile hardware register.

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3446,10 +3446,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
34463446
: 0); // swz
34473447

34483448
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3449+
// Don't set the offset value here because the pointer points to the base of
3450+
// the buffer.
34493451
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3450-
LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3452+
34513453
MachinePointerInfo StorePtrI = LoadPtrI;
3452-
StorePtrI.V = nullptr;
3454+
LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3455+
AMDGPUAS::BUFFER_RESOURCE));
3456+
LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
34533457
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
34543458

34553459
auto F = LoadMMO->getFlags() &
@@ -3627,13 +3631,17 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
36273631
if (isSGPR(Addr))
36283632
MIB.addReg(VOffset);
36293633

3630-
MIB.add(MI.getOperand(4)) // offset
3631-
.add(MI.getOperand(5)); // cpol
3634+
MIB.add(MI.getOperand(4)); // offset
3635+
3636+
unsigned Aux = MI.getOperand(5).getImm();
3637+
MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
36323638

36333639
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
36343640
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
36353641
LoadPtrI.Offset = MI.getOperand(4).getImm();
36363642
MachinePointerInfo StorePtrI = LoadPtrI;
3643+
LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3644+
AMDGPUAS::GLOBAL_ADDRESS));
36373645
LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
36383646
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
36393647
auto F = LoadMMO->getFlags() &

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,9 @@ enum CPol {
423423
// Volatile (used to preserve/signal operation volatility for buffer
424424
// operations not a real instruction bit)
425425
VOLATILE = 1 << 31,
426+
// The set of "cache policy" bits used for compiler features that
427+
// do not correspond to hardware features.
428+
VIRTUAL_BITS = VOLATILE,
426429
};
427430

428431
} // namespace CPol

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1651,6 +1651,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16511651
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
16521652
Info.ptrVal = CI.getArgOperand(1);
16531653
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1654+
auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1655+
if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1656+
Info.flags |= MachineMemOperand::MOVolatile;
16541657
return true;
16551658
}
16561659
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
@@ -11219,8 +11222,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1121911222

1122011223
MachinePointerInfo StorePtrI = LoadPtrI;
1122111224
LoadPtrI.V = PoisonValue::get(
11222-
PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11223-
LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11225+
PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
11226+
LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
1122411227
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
1122511228

1122611229
auto F = LoadMMO->getFlags() &
@@ -11307,7 +11310,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1130711310
}
1130811311

1130911312
Ops.push_back(Op.getOperand(5)); // Offset
11310-
Ops.push_back(Op.getOperand(6)); // CPol
11313+
11314+
unsigned Aux = Op.getConstantOperandVal(6);
11315+
Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11316+
MVT::i32)); // CPol
11317+
1131111318
Ops.push_back(M0Val.getValue(0)); // Chain
1131211319
Ops.push_back(M0Val.getValue(1)); // Glue
1131311320

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "llvm/IR/DiagnosticInfo.h"
2626
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
2727
#include "llvm/IR/PassManager.h"
28+
#include "llvm/Support/AMDGPUAddrSpace.h"
2829
#include "llvm/Support/AtomicOrdering.h"
2930
#include "llvm/TargetParser/TargetParser.h"
3031

@@ -277,6 +278,12 @@ class SIMemOpAccess final {
277278
/// rmw operation, "std::nullopt" otherwise.
278279
std::optional<SIMemOpInfo>
279280
getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
281+
282+
/// \returns DMA to LDS info if \p MI is a direct-to/from-LDS load/store,
283+
/// along with an indication of whether this is a load or store. If it is not
284+
/// a direct-to-LDS operation, returns std::nullopt.
285+
std::optional<SIMemOpInfo>
286+
getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
280287
};
281288

282289
class SICacheControl {
@@ -703,6 +710,9 @@ class SIMemoryLegalizer final {
703710
/// instructions are added/deleted or \p MI is modified, false otherwise.
704711
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
705712
MachineBasicBlock::iterator &MI);
713+
/// Expands LDS DMA operation \p MI. Returns true if instructions are
714+
/// added/deleted or \p MI is modified, false otherwise.
715+
bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
706716

707717
public:
708718
SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
@@ -832,6 +842,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
832842
return SIAtomicAddrSpace::SCRATCH;
833843
if (AS == AMDGPUAS::REGION_ADDRESS)
834844
return SIAtomicAddrSpace::GDS;
845+
if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
846+
AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
847+
return SIAtomicAddrSpace::GLOBAL;
835848

836849
return SIAtomicAddrSpace::OTHER;
837850
}
@@ -987,6 +1000,16 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
9871000
return constructFromMIWithMMO(MI);
9881001
}
9891002

1003+
std::optional<SIMemOpInfo>
1004+
SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
1005+
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
1006+
1007+
if (!SIInstrInfo::isLDSDMA(*MI))
1008+
return std::nullopt;
1009+
1010+
return constructFromMIWithMMO(MI);
1011+
}
1012+
9901013
SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
9911014
TII = ST.getInstrInfo();
9921015
IV = getIsaVersion(ST.getCPU());
@@ -1099,7 +1122,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
10991122
// Only handle load and store, not atomic read-modify-write insructions. The
11001123
// latter use glc to indicate if the atomic returns a result and so must not
11011124
// be used for cache control.
1102-
assert(MI->mayLoad() ^ MI->mayStore());
1125+
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
11031126

11041127
// Only update load and store, not LLVM IR atomic read-modify-write
11051128
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1429,7 +1452,7 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
14291452
// Only handle load and store, not atomic read-modify-write instructions. The
14301453
// latter use glc to indicate if the atomic returns a result and so must not
14311454
// be used for cache control.
1432-
assert(MI->mayLoad() ^ MI->mayStore());
1455+
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
14331456

14341457
// Only update load and store, not LLVM IR atomic read-modify-write
14351458
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1733,7 +1756,7 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
17331756
// Only handle load and store, not atomic read-modify-write insructions. The
17341757
// latter use glc to indicate if the atomic returns a result and so must not
17351758
// be used for cache control.
1736-
assert(MI->mayLoad() ^ MI->mayStore());
1759+
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
17371760

17381761
// Only update load and store, not LLVM IR atomic read-modify-write
17391762
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1968,7 +1991,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
19681991
// Only handle load and store, not atomic read-modify-write instructions. The
19691992
// latter use glc to indicate if the atomic returns a result and so must not
19701993
// be used for cache control.
1971-
assert(MI->mayLoad() ^ MI->mayStore());
1994+
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
19721995

19731996
// Only update load and store, not LLVM IR atomic read-modify-write
19741997
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2266,7 +2289,7 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
22662289
// Only handle load and store, not atomic read-modify-write insructions. The
22672290
// latter use glc to indicate if the atomic returns a result and so must not
22682291
// be used for cache control.
2269-
assert(MI->mayLoad() ^ MI->mayStore());
2292+
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
22702293

22712294
// Only update load and store, not LLVM IR atomic read-modify-write
22722295
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2611,7 +2634,7 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
26112634
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
26122635

26132636
// Only handle load and store, not atomic read-modify-write instructions.
2614-
assert(MI->mayLoad() ^ MI->mayStore());
2637+
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
26152638

26162639
// Only update load and store, not LLVM IR atomic read-modify-write
26172640
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2934,6 +2957,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
29342957
return Changed;
29352958
}
29362959

2960+
bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
2961+
MachineBasicBlock::iterator &MI) {
2962+
assert(MI->mayLoad() && MI->mayStore());
2963+
2964+
// The volatility or nontemporal-ness of the operation is a
2965+
// function of the global memory, not the LDS.
2966+
SIMemOp OpKind =
2967+
SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
2968+
2969+
// Handle volatile and/or nontemporal markers on direct-to-LDS loads and
2970+
// stores. The operation is treated as a volatile/nontemporal store
2971+
// to its second argument.
2972+
return CC->enableVolatileAndOrNonTemporal(
2973+
MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
2974+
MOI.isNonTemporal(), MOI.isLastUse());
2975+
}
2976+
29372977
bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
29382978
const MachineModuleInfo &MMI =
29392979
getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
@@ -2985,14 +3025,17 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
29853025
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
29863026
continue;
29873027

2988-
if (const auto &MOI = MOA.getLoadInfo(MI))
3028+
if (const auto &MOI = MOA.getLoadInfo(MI)) {
29893029
Changed |= expandLoad(*MOI, MI);
2990-
else if (const auto &MOI = MOA.getStoreInfo(MI)) {
3030+
} else if (const auto &MOI = MOA.getStoreInfo(MI)) {
29913031
Changed |= expandStore(*MOI, MI);
2992-
} else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
3032+
} else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
3033+
Changed |= expandLDSDMA(*MOI, MI);
3034+
} else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
29933035
Changed |= expandAtomicFence(*MOI, MI);
2994-
else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
3036+
} else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
29953037
Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
3038+
}
29963039
}
29973040
}
29983041

llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3611,10 +3611,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
36113611
; SDAG: ; %bb.0:
36123612
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36133613
; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
3614+
; SDAG-NEXT: s_waitcnt vmcnt(0)
36143615
; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc
3615-
; SDAG-NEXT: s_waitcnt vmcnt(1)
3616-
; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
36173616
; SDAG-NEXT: s_waitcnt vmcnt(0)
3617+
; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
36183618
; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6
36193619
; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
36203620
; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3627,12 +3627,12 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
36273627
; GISEL: ; %bb.0:
36283628
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36293629
; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
3630+
; GISEL-NEXT: s_waitcnt vmcnt(0)
36303631
; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
3631-
; GISEL-NEXT: s_waitcnt vmcnt(1)
3632+
; GISEL-NEXT: s_waitcnt vmcnt(0)
36323633
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
36333634
; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
36343635
; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
3635-
; GISEL-NEXT: s_waitcnt vmcnt(0)
36363636
; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
36373637
; GISEL-NEXT: s_setpc_b64 s[30:31]
36383638
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3652,6 +3652,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
36523652
; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
36533653
; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
36543654
; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
3655+
; SDAG-NEXT: s_waitcnt vmcnt(0)
36553656
; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4
36563657
; SDAG-NEXT: s_waitcnt vmcnt(0)
36573658
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -3671,6 +3672,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
36713672
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
36723673
; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
36733674
; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
3675+
; GISEL-NEXT: s_waitcnt vmcnt(0)
36743676
; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4
36753677
; GISEL-NEXT: s_waitcnt vmcnt(0)
36763678
; GISEL-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)