97 changes: 84 additions & 13 deletions llvm/lib/Transforms/Scalar/Sink.cpp
@@ -15,8 +15,11 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/IR/Dominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
@@ -60,10 +63,64 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
return true;
}

static cl::opt<unsigned> SinkLoadStoreLimit(
"sink-load-store-limit", cl::Hidden, cl::init(4),
cl::desc("Maximum number of stores in descendant blocks that will be "
"analyzed when attempting to sink a load."));
Comment on lines +66 to +69

Contributor: Can you avoid this super specific flag? Is there some threshold MemorySSA usually uses? At worst, can it be a pass parameter instead?

Contributor Author: I see a MaxCheckLimit in MemorySSA.cpp, but it is local to that file. EarlyCSE.cpp, DeadStoreElimination.cpp, and LICM.cpp all have hidden options setting limits on MemorySSA usage. Is it okay if we stick with a hidden option? Otherwise, by "pass parameter" do you mean a limit specific to Sink that is not implemented as a user option?
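For reference, a "pass parameter" in the new-pass-manager sense would look roughly like the sketch below, which threads the limit through an options struct instead of a global cl::opt. This is an illustrative sketch only: the SinkOptions struct, its field name, and the constructor are assumptions, not part of this patch.

#include "llvm/IR/PassManager.h"

namespace llvm {

// Hypothetical options bag for SinkingPass, mirroring how SimplifyCFGPass
// takes a SimplifyCFGOptions struct. Not in the actual patch.
struct SinkOptions {
  unsigned LoadStoreLimit = 4; // same default the patch gives the hidden cl::opt
};

class SinkingPass : public PassInfoMixin<SinkingPass> {
  SinkOptions Options;

public:
  SinkingPass(SinkOptions Opts = SinkOptions()) : Options(Opts) {}
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

} // namespace llvm

PassBuilder would then parse the limit from the pipeline string, e.g. -passes='sink<load-store-limit=8>', the way it already parses simplifycfg's parameters; that syntax is likewise an assumption here, since sink currently takes no parameters.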


using BlocksSet = SmallPtrSet<BasicBlock *, 8>;
static bool hasStoreConflict(BasicBlock *LoadBB, BasicBlock *BB,
BlocksSet &VisitedBlocksSet,
MemorySSAUpdater &MSSAU, BatchAAResults &BAA,
Instruction *ReadMemInst, unsigned &StoreCnt) {
if (BB == LoadBB || !VisitedBlocksSet.insert(BB).second)
return false;
if (auto *Accesses = MSSAU.getMemorySSA()->getBlockDefs(BB)) {
StoreCnt += Accesses->size();
if (StoreCnt > SinkLoadStoreLimit)
return true;
for (auto &MA : *Accesses) {
if (auto *MD = dyn_cast<MemoryDef>(&MA)) {
Instruction *S = MD->getMemoryInst();
if (LoadInst *L = dyn_cast<LoadInst>(ReadMemInst)) {
MemoryLocation Loc = MemoryLocation::get(L);
if (isModSet(BAA.getModRefInfo(S, Loc)))
return true;
} else if (auto *Call = dyn_cast<CallBase>(ReadMemInst)) {
if (isModSet(BAA.getModRefInfo(S, Call)))
return true;
}
}
}
}
for (BasicBlock *Pred : predecessors(BB)) {
if (hasStoreConflict(LoadBB, Pred, VisitedBlocksSet, MSSAU, BAA,
ReadMemInst, StoreCnt))
return true;
}
return false;
}

static bool hasConflictingStoreBeforeSuccToSinkTo(Instruction *ReadMemInst,
BasicBlock *SuccToSinkTo,
MemorySSAUpdater &MSSAU,
BatchAAResults &BAA) {
BlocksSet VisitedBlocksSet;
BasicBlock *LoadBB = ReadMemInst->getParent();
unsigned StoreCnt{0};

for (BasicBlock *Pred : predecessors(SuccToSinkTo))
if (hasStoreConflict(LoadBB, Pred, VisitedBlocksSet, MSSAU, BAA,
ReadMemInst, StoreCnt))
return true;
return false;
}

/// IsAcceptableTarget - Return true if it is possible to sink the instruction
/// in the specified basic block.
static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
DominatorTree &DT, LoopInfo &LI) {
DominatorTree &DT, LoopInfo &LI,
MemorySSAUpdater &MSSAU, BatchAAResults &BAA) {
assert(Inst && "Instruction to be sunk is null");
assert(SuccToSinkTo && "Candidate sink target is null");

@@ -76,10 +133,10 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
// just punt.
// FIXME: Split critical edges if not backedges.
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
// Ensure that there is no conflicting store on any path to SuccToSinkTo.
if (Inst->mayReadFromMemory() &&
!Inst->hasMetadata(LLVMContext::MD_invariant_load))
!Inst->hasMetadata(LLVMContext::MD_invariant_load) &&
hasConflictingStoreBeforeSuccToSinkTo(Inst, SuccToSinkTo, MSSAU, BAA))
return false;

// We don't want to sink across a critical edge if we don't dominate the
@@ -101,7 +158,8 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
/// instruction out of its current block into a successor.
static bool SinkInstruction(Instruction *Inst,
SmallPtrSetImpl<Instruction *> &Stores,
DominatorTree &DT, LoopInfo &LI, AAResults &AA) {
DominatorTree &DT, LoopInfo &LI, AAResults &AA,
MemorySSAUpdater &MSSAU) {

// Don't sink static alloca instructions. CodeGen assumes allocas outside the
// entry block are dynamically sized stack objects.
@@ -152,8 +210,9 @@ static bool SinkInstruction(Instruction *Inst,
if (SuccToSinkTo) {
// The nearest common dominator may be in a parent loop of BB, which may not
// be beneficial. Find an ancestor.
BatchAAResults BAA(AA);
while (SuccToSinkTo != BB &&
!IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI))
!IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI, MSSAU, BAA))
SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
if (SuccToSinkTo == BB)
SuccToSinkTo = nullptr;
@@ -169,11 +228,15 @@ static bool SinkInstruction(Instruction *Inst,

// Move the instruction.
Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt());
if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
MSSAU.getMemorySSA()->getMemoryAccess(Inst)))
MSSAU.moveToPlace(OldMemAcc, SuccToSinkTo, MemorySSA::Beginning);

return true;
}

static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
AAResults &AA) {
AAResults &AA, MemorySSAUpdater &MSSAU) {
// Don't bother sinking code out of unreachable blocks. In addition to being
// unprofitable, it can also lead to infinite looping, because in an
// unreachable loop there may be nowhere to stop.
@@ -198,7 +261,7 @@ static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
if (Inst->isDebugOrPseudoInst())
continue;

if (SinkInstruction(Inst, Stores, DT, LI, AA)) {
if (SinkInstruction(Inst, Stores, DT, LI, AA, MSSAU)) {
++NumSunk;
MadeChange = true;
}
@@ -210,15 +273,16 @@ static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
}

static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
LoopInfo &LI, AAResults &AA) {
LoopInfo &LI, AAResults &AA,
MemorySSAUpdater &MSSAU) {
bool MadeChange, EverMadeChange = false;

do {
MadeChange = false;
LLVM_DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
// Process all basic blocks.
for (BasicBlock &I : F)
MadeChange |= ProcessBlock(I, DT, LI, AA);
MadeChange |= ProcessBlock(I, DT, LI, AA, MSSAU);
EverMadeChange |= MadeChange;
NumSinkIter++;
} while (MadeChange);
@@ -230,12 +294,15 @@ PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
MemorySSAUpdater MSSAU(&MSSA);

if (!iterativelySinkInstructions(F, DT, LI, AA))
if (!iterativelySinkInstructions(F, DT, LI, AA, MSSAU))
return PreservedAnalyses::all();

PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<MemorySSAAnalysis>();
return PA;
}

@@ -251,8 +318,9 @@ namespace {
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();

return iterativelySinkInstructions(F, DT, LI, AA);
MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
MemorySSAUpdater MSSAU(MSSA);
return iterativelySinkInstructions(F, DT, LI, AA, MSSAU);
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -261,8 +329,10 @@ namespace {
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
}
};
} // end anonymous namespace
@@ -271,6 +341,7 @@ char SinkingLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(SinkingLegacyPass, "sink", "Code sinking", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SinkingLegacyPass, "sink", "Code sinking", false, false)

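With a build containing this patch, the new threshold can be exercised directly, since hidden options are still accepted on the opt command line: opt -passes=sink -sink-load-store-limit=8 -S in.ll -o out.ll (the file names are placeholders). The test updates below show the intended effect of the change: the load_dwordx3/load_b96 feeding v_div_fmas_f32 is no longer pinned in the entry block, but sinks past the divergent-branch region into the %exit block, next to its only use.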
86 changes: 44 additions & 42 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1330,13 +1330,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [8 x i32], ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %dummy) {
; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[0:3], 0 addr64
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX7-NEXT: s_mov_b64 vcc, 0
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
@@ -1355,24 +1349,22 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX7-NEXT: s_or_b64 vcc, s[8:9], s[0:1]
; GFX7-NEXT: .LBB13_2: ; %exit
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX8-NEXT: s_mov_b64 vcc, 0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
@@ -1391,25 +1383,29 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX8-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX8-NEXT: .LBB13_2: ; %exit
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v1, s[0:1]
; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 8
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W32: ; %bb.0: ; %entry
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1]
; GFX10_W32-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
; GFX10_W32-NEXT: s_and_saveexec_b32 s1, s0
; GFX10_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W32-NEXT: ; %bb.1: ; %bb
@@ -1426,22 +1422,23 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX10_W32-NEXT: s_or_b32 vcc_lo, s2, s0
; GFX10_W32-NEXT: .LBB13_2: ; %exit
; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX10_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] offset:8
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W64: ; %bb.0: ; %entry
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX10_W64-NEXT: s_mov_b64 vcc, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1]
; GFX10_W64-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX10_W64-NEXT: s_mov_b64 vcc, 0
; GFX10_W64-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX10_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W64-NEXT: ; %bb.1: ; %bb
@@ -1458,24 +1455,25 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX10_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX10_W64-NEXT: .LBB13_2: ; %exit
; GFX10_W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX10_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] offset:8
; GFX10_W64-NEXT: s_endpgm
;
; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W32: ; %bb.0: ; %entry
; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
; GFX11_W32-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX11_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11_W32-NEXT: s_mov_b32 vcc_lo, 0
; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v3
; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W32-NEXT: s_mov_b32 s1, exec_lo
; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W32-NEXT: ; %bb.1: ; %bb
; GFX11_W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x50
@@ -1491,6 +1489,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX11_W32-NEXT: s_or_b32 vcc_lo, s2, s0
; GFX11_W32-NEXT: .LBB13_2: ; %exit
; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11_W32-NEXT: s_waitcnt vmcnt(0)
; GFX11_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
@@ -1501,14 +1503,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX11_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W64: ; %bb.0: ; %entry
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
; GFX11_W64-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX11_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11_W64-NEXT: s_mov_b64 vcc, 0
; GFX11_W64-NEXT: s_mov_b64 s[2:3], exec
; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v3
; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W64-NEXT: ; %bb.1: ; %bb
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x50
@@ -1524,6 +1522,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX11_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX11_W64-NEXT: .LBB13_2: ; %exit
; GFX11_W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11_W64-NEXT: s_waitcnt vmcnt(0)
; GFX11_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2