diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp
index 1a48a59c4189e..07554857c2c97 100644
--- a/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -15,8 +15,11 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
@@ -60,10 +63,64 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
   return true;
 }
 
+static cl::opt<unsigned> SinkLoadStoreLimit(
+    "sink-load-store-limit", cl::Hidden, cl::init(4),
+    cl::desc("Maximum number of stores in descendant blocks that will be "
+             "analyzed when attempting to sink a load."));
+
+using BlocksSet = SmallPtrSet<BasicBlock *, 8>;
+static bool hasStoreConflict(BasicBlock *LoadBB, BasicBlock *BB,
+                             BlocksSet &VisitedBlocksSet,
+                             MemorySSAUpdater &MSSAU, BatchAAResults &BAA,
+                             Instruction *ReadMemInst, unsigned &StoreCnt) {
+  if (BB == LoadBB || !VisitedBlocksSet.insert(BB).second)
+    return false;
+  if (auto *Accesses = MSSAU.getMemorySSA()->getBlockDefs(BB)) {
+    StoreCnt += Accesses->size();
+    if (StoreCnt > SinkLoadStoreLimit)
+      return true;
+    for (auto &MA : *Accesses) {
+      if (auto *MD = dyn_cast<MemoryDef>(&MA)) {
+        Instruction *S = MD->getMemoryInst();
+        if (LoadInst *L = dyn_cast<LoadInst>(ReadMemInst)) {
+          MemoryLocation Loc = MemoryLocation::get(L);
+          if (isModSet(BAA.getModRefInfo(S, Loc)))
+            return true;
+        } else if (auto *Call = dyn_cast<CallBase>(ReadMemInst)) {
+          if (isModSet(BAA.getModRefInfo(S, Call)))
+            return true;
+        }
+      }
+    }
+  }
+  for (BasicBlock *Pred : predecessors(BB)) {
+    if (hasStoreConflict(LoadBB, Pred, VisitedBlocksSet, MSSAU, BAA,
+                         ReadMemInst, StoreCnt))
+      return true;
+  }
+  return false;
+}
+
+static bool hasConflictingStoreBeforeSuccToSinkTo(Instruction *ReadMemInst,
+                                                  BasicBlock *SuccToSinkTo,
+                                                  MemorySSAUpdater &MSSAU,
+                                                  BatchAAResults &BAA) {
+  BlocksSet VisitedBlocksSet;
+  BasicBlock *LoadBB = ReadMemInst->getParent();
+  unsigned StoreCnt{0};
+
+  for (BasicBlock *Pred : predecessors(SuccToSinkTo))
+    if (hasStoreConflict(LoadBB, Pred, VisitedBlocksSet, MSSAU, BAA,
+                         ReadMemInst, StoreCnt))
+      return true;
+  return false;
+}
+
 /// IsAcceptableTarget - Return true if it is possible to sink the instruction
 /// in the specified basic block.
 static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
-                               DominatorTree &DT, LoopInfo &LI) {
+                               DominatorTree &DT, LoopInfo &LI,
+                               MemorySSAUpdater &MSSAU, BatchAAResults &BAA) {
   assert(Inst && "Instruction to be sunk is null");
   assert(SuccToSinkTo && "Candidate sink target is null");
 
@@ -76,10 +133,10 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
   // just punt.
   // FIXME: Split critical edges if not backedges.
   if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
-    // We cannot sink a load across a critical edge - there may be stores in
-    // other code paths.
+    // Ensure that there is no conflicting store on any path to SuccToSinkTo.
     if (Inst->mayReadFromMemory() &&
-        !Inst->hasMetadata(LLVMContext::MD_invariant_load))
+        !Inst->hasMetadata(LLVMContext::MD_invariant_load) &&
+        hasConflictingStoreBeforeSuccToSinkTo(Inst, SuccToSinkTo, MSSAU, BAA))
       return false;
 
     // We don't want to sink across a critical edge if we don't dominate the
@@ -101,7 +158,8 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
 /// instruction out of its current block into a successor.
 static bool SinkInstruction(Instruction *Inst,
                             SmallPtrSetImpl<Instruction *> &Stores,
-                            DominatorTree &DT, LoopInfo &LI, AAResults &AA) {
+                            DominatorTree &DT, LoopInfo &LI, AAResults &AA,
+                            MemorySSAUpdater &MSSAU) {
 
   // Don't sink static alloca instructions. CodeGen assumes allocas outside the
   // entry block are dynamically sized stack objects.
@@ -152,8 +210,9 @@ static bool SinkInstruction(Instruction *Inst,
   if (SuccToSinkTo) {
     // The nearest common dominator may be in a parent loop of BB, which may not
     // be beneficial. Find an ancestor.
+    BatchAAResults BAA(AA);
     while (SuccToSinkTo != BB &&
-           !IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI))
+           !IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI, MSSAU, BAA))
       SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
     if (SuccToSinkTo == BB)
       SuccToSinkTo = nullptr;
@@ -169,11 +228,15 @@ static bool SinkInstruction(Instruction *Inst,
 
   // Move the instruction.
   Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt());
+  if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
+          MSSAU.getMemorySSA()->getMemoryAccess(Inst)))
+    MSSAU.moveToPlace(OldMemAcc, SuccToSinkTo, MemorySSA::Beginning);
+
   return true;
 }
 
 static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
-                         AAResults &AA) {
+                         AAResults &AA, MemorySSAUpdater &MSSAU) {
   // Don't bother sinking code out of unreachable blocks. In addition to being
   // unprofitable, it can also lead to infinite looping, because in an
   // unreachable loop there may be nowhere to stop.
@@ -198,7 +261,7 @@ static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
     if (Inst->isDebugOrPseudoInst())
       continue;
 
-    if (SinkInstruction(Inst, Stores, DT, LI, AA)) {
+    if (SinkInstruction(Inst, Stores, DT, LI, AA, MSSAU)) {
       ++NumSunk;
       MadeChange = true;
     }
@@ -210,7 +273,8 @@ static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
 }
 
 static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
-                                        LoopInfo &LI, AAResults &AA) {
+                                        LoopInfo &LI, AAResults &AA,
+                                        MemorySSAUpdater &MSSAU) {
   bool MadeChange, EverMadeChange = false;
 
   do {
@@ -218,7 +282,7 @@ static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
     LLVM_DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
     // Process all basic blocks.
     for (BasicBlock &I : F)
-      MadeChange |= ProcessBlock(I, DT, LI, AA);
+      MadeChange |= ProcessBlock(I, DT, LI, AA, MSSAU);
     EverMadeChange |= MadeChange;
     NumSinkIter++;
   } while (MadeChange);
@@ -230,12 +294,15 @@ PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &LI = AM.getResult<LoopAnalysis>(F);
   auto &AA = AM.getResult<AAManager>(F);
+  MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+  MemorySSAUpdater MSSAU(&MSSA);
 
-  if (!iterativelySinkInstructions(F, DT, LI, AA))
+  if (!iterativelySinkInstructions(F, DT, LI, AA, MSSAU))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
+  PA.preserve<MemorySSAAnalysis>();
   return PA;
 }
 
@@ -251,8 +318,9 @@ namespace {
       auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
       auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
       auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
-
-      return iterativelySinkInstructions(F, DT, LI, AA);
+      MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+      MemorySSAUpdater MSSAU(MSSA);
+      return iterativelySinkInstructions(F, DT, LI, AA, MSSAU);
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -261,8 +329,10 @@ namespace {
       AU.addRequired<AAResultsWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addRequired<MemorySSAWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addPreserved<MemorySSAWrapperPass>();
     }
   };
 } // end anonymous namespace
@@ -271,6 +341,7 @@ char SinkingLegacyPass::ID = 0;
 INITIALIZE_PASS_BEGIN(SinkingLegacyPass, "sink", "Code sinking", false, false)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(SinkingLegacyPass, "sink", "Code sinking", false, false)
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 074272f7bed86..28ade94040688 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1330,13 +1330,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [8 x i32], ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %dummy) {
 ; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v2, 0
 ; GFX7-NEXT: s_mov_b32 s2, 0
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[0:3], 0 addr64
 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
 ; GFX7-NEXT: s_mov_b64 vcc, 0
 ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
@@ -1355,24 +1349,22 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX7-NEXT: s_or_b64 vcc, s[8:9], s[0:1]
 ; GFX7-NEXT: .LBB13_2: ; %exit
 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3
 ; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
 ; GFX7-NEXT: s_endpgm
 ;
; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 ; GFX8-NEXT: s_mov_b64 vcc, 0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] @@ -1391,12 +1383,20 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; GFX8-NEXT: s_or_b64 vcc, s[6:7], s[0:1] ; GFX8-NEXT: .LBB13_2: ; %exit ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, v2 +; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v1, s[0:1] +; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 8 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1404,12 +1404,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; ; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W32: ; %bb.0: ; %entry -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28 -; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0 -; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1] ; GFX10_W32-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10_W32-NEXT: s_and_saveexec_b32 s1, s0 ; GFX10_W32-NEXT: s_cbranch_execz .LBB13_2 ; GFX10_W32-NEXT: ; %bb.1: ; %bb @@ -1426,9 +1422,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; GFX10_W32-NEXT: s_or_b32 vcc_lo, s2, s0 ; GFX10_W32-NEXT: .LBB13_2: ; %exit ; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28 +; GFX10_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W32-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1] +; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] offset:8 @@ -1436,12 +1437,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W64: ; %bb.0: ; %entry -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28 -; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX10_W64-NEXT: s_mov_b64 vcc, 0 -; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1] ; GFX10_W64-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 +; GFX10_W64-NEXT: s_mov_b64 vcc, 0 ; GFX10_W64-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX10_W64-NEXT: s_cbranch_execz .LBB13_2 ; GFX10_W64-NEXT: ; 
%bb.1: ; %bb @@ -1458,9 +1455,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; GFX10_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1] ; GFX10_W64-NEXT: .LBB13_2: ; %exit ; GFX10_W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28 +; GFX10_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W64-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1] +; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] offset:8 @@ -1468,14 +1470,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; ; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX11_W32: ; %bb.0: ; %entry -; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28 -; GFX11_W32-NEXT: v_and_b32_e32 v3, 0x3ff, v0 +; GFX11_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11_W32-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v3 -; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1] ; GFX11_W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2 ; GFX11_W32-NEXT: ; %bb.1: ; %bb ; GFX11_W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x50 @@ -1491,6 +1489,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; GFX11_W32-NEXT: s_or_b32 vcc_lo, s2, s0 ; GFX11_W32-NEXT: .LBB13_2: ; %exit ; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28 +; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1] ; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11_W32-NEXT: s_waitcnt vmcnt(0) ; GFX11_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2 @@ -1501,14 +1503,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; ; GFX11_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX11_W64: ; %bb.0: ; %entry -; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28 -; GFX11_W64-NEXT: v_and_b32_e32 v3, 0x3ff, v0 +; GFX11_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11_W64-NEXT: s_mov_b64 vcc, 0 ; GFX11_W64-NEXT: s_mov_b64 s[2:3], exec -; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v3 -; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1] -; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11_W64-NEXT: s_cbranch_execz .LBB13_2 ; GFX11_W64-NEXT: ; %bb.1: ; %bb ; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x50 @@ -1524,6 +1522,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; GFX11_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1] ; GFX11_W64-NEXT: .LBB13_2: ; %exit ; GFX11_W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28 +; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1] ; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11_W64-NEXT: s_waitcnt vmcnt(0) ; GFX11_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll 
index 7179f687c70f2..0d6fa3a80b883 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -249,6 +249,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: Memory SSA ; GCN-O1-NEXT: Code sinking ; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: Uniformity Analysis @@ -550,6 +551,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: Memory SSA ; GCN-O1-OPTS-NEXT: Code sinking ; GCN-O1-OPTS-NEXT: Cycle Info Analysis ; GCN-O1-OPTS-NEXT: Uniformity Analysis @@ -863,6 +865,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: Code sinking ; GCN-O2-NEXT: Cycle Info Analysis ; GCN-O2-NEXT: Uniformity Analysis @@ -1191,6 +1194,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: Code sinking ; GCN-O3-NEXT: Cycle Info Analysis ; GCN-O3-NEXT: Uniformity Analysis diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 4a6b2ebd3d203..500659ea0ca86 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -877,14 +877,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB1_3 Depth 2 ; CHECK-NEXT: ; Child Loop BB1_8 Depth 2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44 ; CHECK-NEXT: s_lshl_b32 s5, s4, 5 ; CHECK-NEXT: s_add_i32 s53, s4, 1 ; CHECK-NEXT: s_add_i32 s6, s4, 5 -; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v46, v0 -; CHECK-NEXT: v_mov_b32_e32 v56, s53 +; CHECK-NEXT: v_or3_b32 v46, s5, v42, s53 +; CHECK-NEXT: v_mov_b32_e32 v47, s53 ; CHECK-NEXT: s_mov_b32 s5, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_5 @@ -898,46 +895,48 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_add_i32 s7, s7, 4 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 ; CHECK-NEXT: s_add_i32 s8, s4, s7 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v46 ; CHECK-NEXT: s_add_i32 s9, s8, 5 ; CHECK-NEXT: s_add_i32 s8, s8, 1 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41 -; CHECK-NEXT: v_mov_b32_e32 v56, s8 +; CHECK-NEXT: v_mov_b32_e32 v47, s8 ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB1_3 ; CHECK-NEXT: ; %bb.4: ; %Flow3 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; CHECK-NEXT: v_mov_b32_e32 v47, v0 +; CHECK-NEXT: v_mov_b32_e32 v46, v0 ; CHECK-NEXT: .LBB1_5: ; %Flow4 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_mov_b32 s54, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41 +; CHECK-NEXT: v_cmpx_lt_u32_e64 v47, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_11 ; CHECK-NEXT: ; %bb.6: ; %.103.preheader ; 
CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44 ; CHECK-NEXT: s_mov_b32 s55, 0 +; CHECK-NEXT: ds_read_u8 v56, v0 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB1_8 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_7: ; %.114 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64 -; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41 +; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v47, v41 ; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: s_cbranch_execz .LBB1_10 ; CHECK-NEXT: .LBB1_8: ; %.103 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56 +; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v47 ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD ; CHECK-NEXT: s_and_saveexec_b32 s64, s4 ; CHECK-NEXT: s_cbranch_execz .LBB1_7 ; CHECK-NEXT: ; %bb.9: ; %.110 @@ -958,7 +957,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: ds_write_b32 v0, v47 +; CHECK-NEXT: ds_write_b32 v0, v46 ; CHECK-NEXT: s_branch .LBB1_7 ; CHECK-NEXT: .LBB1_10: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index c5732531f5423..ec95a7ed03b95 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -73,8 +73,8 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % } ; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure: -; GFX908: NumSgprs: 64 -; GFX908-GCNTRACKERS: NumSgprs: 64 +; GFX908: NumSgprs: 56 +; GFX908-GCNTRACKERS: NumSgprs: 56 ; GFX908: NumVgprs: 43 ; GFX908-GCNTRACKERS: NumVgprs: 39 ; GFX908: Occupancy: 5 diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll index a27d1217031ca..0e30b4bb5925c 100644 --- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll +++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll @@ -72,13 +72,14 @@ entry: a: %v2 = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 1, i32 0) + %v3 = fadd <2 x float> %v1, %v2 %v20 = extractelement <2 x float> %v2, i32 0 %v21 = extractelement <2 x float> %v2, i32 1 %cond2 = fcmp ult float %v20, %v21 br i1 %cond2, label %b, label %c b: - ret <2 x float> %v2 + ret <2 x float> %v3 c: %v4 = fadd <2 x float> %v1, %v1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 8dfd841671730..7426ecca7301a 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -41,7 +41,8 @@ bb: %tmp20 = extractelement <4 x float> %tmp18, i32 1 %tmp21 = extractelement <4 x float> %tmp18, i32 2 %tmp22 = extractelement <4 x float> %tmp18, i32 3 - %tmp23 = bitcast float %tmp14 to i32 + %tmp23 = fadd float %tmp14, %tmp22 + %tmp24 = 
bitcast float %tmp23 to i32 br label %bb24 bb24: ; preds = %bb157, %bb @@ -218,7 +219,7 @@ bb156: ; preds = %bb24 bb157: ; preds = %bb24 %tmp158 = bitcast float %tmp107 to i32 %tmp159 = bitcast float %tmp107 to i32 - %tmp160 = add i32 %tmp23, %tmp159 + %tmp160 = add i32 %tmp24, %tmp159 %tmp161 = bitcast i32 %tmp160 to float %tmp162 = insertelement <128 x float> poison, float %tmp103, i32 0 %tmp163 = insertelement <128 x float> %tmp162, float %tmp102, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 4212fd3b35cd8..396c06cfbc540 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1266,26 +1266,26 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 -; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx3 v[1:3], v1, s[10:11] ; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; %bb -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v1, s[2:3] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX1032-NEXT: s_and_b32 vcc_lo, vcc_lo, exec_lo ; GFX1032-NEXT: .LBB22_2: ; %exit -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx3 v[0:2], v0, s[10:11] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3 -; GFX1032-NEXT: global_store_dword v0, v1, s[8:9] offset:8 +; GFX1032-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GFX1032-NEXT: global_store_dword v3, v0, s[8:9] offset:8 ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc: @@ -1293,26 +1293,26 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 ; GFX1064-NEXT: s_mov_b64 vcc, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[10:11] ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; %bb -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v1, s[6:7] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX1064-NEXT: s_and_b64 vcc, vcc, exec ; GFX1064-NEXT: .LBB22_2: ; %exit -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx3 v[0:2], v0, s[10:11] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 -; GFX1064-NEXT: global_store_dword v0, v1, s[8:9] offset:8 +; GFX1064-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GFX1064-NEXT: global_store_dword v3, v0, s[8:9] offset:8 ; GFX1064-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/Transforms/Sink/loadsink-limit.ll b/llvm/test/Transforms/Sink/loadsink-limit.ll new file mode 100644 index 0000000000000..471b912390625 --- /dev/null +++ b/llvm/test/Transforms/Sink/loadsink-limit.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S < %s -passes=sink -sink-load-store-limit=2 | FileCheck %s --check-prefix=CHECK_LIMIT_2 +; RUN: opt -S < %s -passes=sink -sink-load-store-limit=3 | FileCheck %s --check-prefix=CHECK_LIMIT_3 + +; Test -sink-load-store-limit option + +; Load can be sunk if -sink-load-store-limit >=3. There are 3 stores to analyze in block thenA. +define void @load_can_sink_noalias(i1 %condReturn, i1 %condA, i1 %condB, ptr noalias %a, ptr %b, ptr %c, ptr %d) { +; CHECK_LIMIT_2-LABEL: define void @load_can_sink_noalias( +; CHECK_LIMIT_2-SAME: i1 [[CONDRETURN:%.*]], i1 [[CONDA:%.*]], i1 [[CONDB:%.*]], ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]]) { +; CHECK_LIMIT_2-NEXT: [[ENTRY:.*:]] +; CHECK_LIMIT_2-NEXT: br i1 [[CONDRETURN]], label %[[IFA:.*]], label %[[RETURN_BLOCK:.*]] +; CHECK_LIMIT_2: [[IFA]]: +; CHECK_LIMIT_2-NEXT: [[VALUE:%.*]] = load i32, ptr [[A]], align 4 +; CHECK_LIMIT_2-NEXT: br i1 [[CONDA]], label %[[THENA:.*]], label %[[IFB:.*]] +; CHECK_LIMIT_2: [[THENA]]: +; CHECK_LIMIT_2-NEXT: store i32 0, ptr [[B]], align 4 +; CHECK_LIMIT_2-NEXT: store i32 1, ptr [[C]], align 4 +; CHECK_LIMIT_2-NEXT: store i32 2, ptr [[D]], align 4 +; CHECK_LIMIT_2-NEXT: br label %[[IFB]] +; CHECK_LIMIT_2: [[IFB]]: +; CHECK_LIMIT_2-NEXT: br i1 [[CONDB]], label %[[THENB:.*]], label %[[RETURN_BLOCK]] +; CHECK_LIMIT_2: [[THENB]]: +; CHECK_LIMIT_2-NEXT: store i32 [[VALUE]], ptr [[B]], align 4 +; CHECK_LIMIT_2-NEXT: br label %[[RETURN_BLOCK]] +; CHECK_LIMIT_2: [[RETURN_BLOCK]]: +; CHECK_LIMIT_2-NEXT: ret void +; +; CHECK_LIMIT_3-LABEL: define void @load_can_sink_noalias( +; CHECK_LIMIT_3-SAME: i1 [[CONDRETURN:%.*]], i1 [[CONDA:%.*]], i1 [[CONDB:%.*]], ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]]) { +; CHECK_LIMIT_3-NEXT: [[ENTRY:.*:]] +; CHECK_LIMIT_3-NEXT: br i1 [[CONDRETURN]], label %[[IFA:.*]], label %[[RETURN_BLOCK:.*]] +; CHECK_LIMIT_3: [[IFA]]: +; CHECK_LIMIT_3-NEXT: br i1 [[CONDA]], label %[[THENA:.*]], label %[[IFB:.*]] +; CHECK_LIMIT_3: [[THENA]]: +; CHECK_LIMIT_3-NEXT: store i32 0, ptr [[B]], align 4 +; CHECK_LIMIT_3-NEXT: store i32 1, ptr [[C]], align 4 +; CHECK_LIMIT_3-NEXT: store i32 2, ptr [[D]], align 4 +; CHECK_LIMIT_3-NEXT: br label %[[IFB]] +; CHECK_LIMIT_3: [[IFB]]: +; CHECK_LIMIT_3-NEXT: br i1 [[CONDB]], label %[[THENB:.*]], label %[[RETURN_BLOCK]] +; CHECK_LIMIT_3: [[THENB]]: +; CHECK_LIMIT_3-NEXT: [[VALUE:%.*]] = load i32, ptr [[A]], align 4 +; CHECK_LIMIT_3-NEXT: store i32 [[VALUE]], ptr [[B]], align 4 +; CHECK_LIMIT_3-NEXT: br label %[[RETURN_BLOCK]] +; CHECK_LIMIT_3: [[RETURN_BLOCK]]: +; CHECK_LIMIT_3-NEXT: ret void +; +entry: + %value = load i32, ptr %a, align 4 + br i1 %condReturn, label %ifA, label %return_block +ifA: + br i1 %condA, label %thenA, label %ifB +thenA: + store i32 0, ptr %b + store i32 1, ptr %c 
+ store i32 2, ptr %d + br label %ifB +ifB: + br i1 %condB, label %thenB, label %return_block +thenB: + store i32 %value, ptr %b + br label %return_block +return_block: + ret void +} + diff --git a/llvm/test/Transforms/Sink/loadsink.ll b/llvm/test/Transforms/Sink/loadsink.ll new file mode 100644 index 0000000000000..9a9c106559d44 --- /dev/null +++ b/llvm/test/Transforms/Sink/loadsink.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S < %s -passes=sink | FileCheck %s + +; Test that loads can be sunk to a non-immediate successor block by analyzing +; paths for conflicting stores. + +declare void @readfunc() readonly willreturn +declare void @maywritefunc() willreturn + +; Load can be sunk to non-immediate successor +define void @load_can_sink(i1 %condA, i1 %condB, ptr %a, ptr %b) { +; CHECK-LABEL: @load_can_sink( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MERGEA:%.*]] +; CHECK: mergeA: +; CHECK-NEXT: br i1 [[CONDA:%.*]], label [[THENA:%.*]], label [[MERGEB:%.*]] +; CHECK: thenA: +; CHECK-NEXT: call void @readfunc() +; CHECK-NEXT: br label [[MERGEB]] +; CHECK: mergeB: +; CHECK-NEXT: br i1 [[CONDB:%.*]], label [[THENB:%.*]], label [[MERGEC:%.*]] +; CHECK: thenB: +; CHECK-NEXT: [[VALUE:%.*]] = load i32, ptr [[A:%.*]], align 4 +; CHECK-NEXT: store i32 [[VALUE]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: br label [[MERGEC]] +; CHECK: mergeC: +; CHECK-NEXT: ret void +; +entry: + %value = load i32, ptr %a, align 4 + br label %mergeA +mergeA: + br i1 %condA, label %thenA, label %mergeB +thenA: + call void @readfunc() + br label %mergeB +mergeB: + br i1 %condB, label %thenB, label %mergeC +thenB: + store i32 %value, ptr %b + br label %mergeC +mergeC: + ret void +} + +; Call may store so load cannot be sunk +define void @load_cannot_sink(i1 %condA, i1 %condB, ptr %a, ptr %b) { +; CHECK-LABEL: @load_cannot_sink( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MERGEA:%.*]] +; CHECK: mergeA: +; CHECK-NEXT: [[VALUE:%.*]] = load i32, ptr [[A:%.*]], align 4 +; CHECK-NEXT: br i1 [[CONDA:%.*]], label [[THENA:%.*]], label [[MERGEB:%.*]] +; CHECK: thenA: +; CHECK-NEXT: call void @maywritefunc() +; CHECK-NEXT: br label [[MERGEB]] +; CHECK: mergeB: +; CHECK-NEXT: br i1 [[CONDB:%.*]], label [[THENB:%.*]], label [[MERGEC:%.*]] +; CHECK: thenB: +; CHECK-NEXT: store i32 [[VALUE]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: br label [[MERGEC]] +; CHECK: mergeC: +; CHECK-NEXT: ret void +; +entry: + %value = load i32, ptr %a, align 4 + br label %mergeA +mergeA: + br i1 %condA, label %thenA, label %mergeB +thenA: + call void @maywritefunc() + br label %mergeB +mergeB: + br i1 %condB, label %thenB, label %mergeC +thenB: + store i32 %value, ptr %b + br label %mergeC +mergeC: + ret void +} + +; Load can be sunk to non-immediate successor because load ptr is noalias +define void @load_can_sink_noalias(i1 %condA, i1 %condB, ptr noalias %a, ptr %b) { +; CHECK-LABEL: @load_can_sink_noalias( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MERGEA:%.*]] +; CHECK: mergeA: +; CHECK-NEXT: br i1 [[CONDA:%.*]], label [[THENA:%.*]], label [[MERGEB:%.*]] +; CHECK: thenA: +; CHECK-NEXT: store i32 0, ptr [[B:%.*]], align 4 +; CHECK-NEXT: br label [[MERGEB]] +; CHECK: mergeB: +; CHECK-NEXT: br i1 [[CONDB:%.*]], label [[THENB:%.*]], label [[MERGEC:%.*]] +; CHECK: thenB: +; CHECK-NEXT: [[VALUE:%.*]] = load i32, ptr [[A:%.*]], align 4 +; CHECK-NEXT: store i32 [[VALUE]], ptr [[B]], align 4 +; CHECK-NEXT: br label [[MERGEC]] +; CHECK: mergeC: +; CHECK-NEXT: ret void +; +entry: + %value = 
load i32, ptr %a, align 4 + br label %mergeA +mergeA: + br i1 %condA, label %thenA, label %mergeB +thenA: + store i32 0, ptr %b + br label %mergeB +mergeB: + br i1 %condB, label %thenB, label %mergeC +thenB: + store i32 %value, ptr %b + br label %mergeC +mergeC: + ret void +} + +; Load cannot be sunk to non-immediate successor because load ptr may alias +define void @load_cannot_sink_alias(i1 %condA, i1 %condB, ptr %a, ptr %b) { +; CHECK-LABEL: @load_cannot_sink_alias( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MERGEA:%.*]] +; CHECK: mergeA: +; CHECK-NEXT: [[VALUE:%.*]] = load i32, ptr [[A:%.*]], align 4 +; CHECK-NEXT: br i1 [[CONDA:%.*]], label [[THENA:%.*]], label [[MERGEB:%.*]] +; CHECK: thenA: +; CHECK-NEXT: store i32 0, ptr [[B:%.*]], align 4 +; CHECK-NEXT: br label [[MERGEB]] +; CHECK: mergeB: +; CHECK-NEXT: br i1 [[CONDB:%.*]], label [[THENB:%.*]], label [[MERGEC:%.*]] +; CHECK: thenB: +; CHECK-NEXT: store i32 [[VALUE]], ptr [[B]], align 4 +; CHECK-NEXT: br label [[MERGEC]] +; CHECK: mergeC: +; CHECK-NEXT: ret void +; +entry: + %value = load i32, ptr %a, align 4 + br label %mergeA +mergeA: + br i1 %condA, label %thenA, label %mergeB +thenA: + store i32 0, ptr %b + br label %mergeB +mergeB: + br i1 %condB, label %thenB, label %mergeC +thenB: + store i32 %value, ptr %b + br label %mergeC +mergeC: + ret void +} + +; Load can be sunk, but not all the way to the use. +define void @load_can_sink_part_of_the_way(i1 %condA, i1 %condB, i1 %condC, ptr noalias %a, ptr %b) { +; CHECK-LABEL: @load_can_sink_part_of_the_way( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MERGEA:%.*]] +; CHECK: mergeA: +; CHECK-NEXT: br i1 [[CONDA:%.*]], label [[THENA:%.*]], label [[MERGEB:%.*]] +; CHECK: thenA: +; CHECK-NEXT: store i32 0, ptr [[B:%.*]], align 4 +; CHECK-NEXT: br label [[MERGEB]] +; CHECK: mergeB: +; CHECK-NEXT: [[VALUE:%.*]] = load i32, ptr [[A:%.*]], align 4 +; CHECK-NEXT: br i1 [[CONDB:%.*]], label [[THENB:%.*]], label [[MERGEC:%.*]] +; CHECK: thenB: +; CHECK-NEXT: call void @maywritefunc() +; CHECK-NEXT: br label [[MERGEC]] +; CHECK: mergeC: +; CHECK-NEXT: br i1 [[CONDC:%.*]], label [[THENC:%.*]], label [[MERGED:%.*]] +; CHECK: thenC: +; CHECK-NEXT: store i32 [[VALUE]], ptr [[B]], align 4 +; CHECK-NEXT: br label [[MERGEC]] +; CHECK: mergeD: +; CHECK-NEXT: ret void +; +entry: + %value = load i32, ptr %a, align 4 + br label %mergeA +mergeA: + br i1 %condA, label %thenA, label %mergeB +thenA: + store i32 0, ptr %b + br label %mergeB +mergeB: + br i1 %condB, label %thenB, label %mergeC +thenB: + call void @maywritefunc() + br label %mergeC +mergeC: + br i1 %condC, label %thenC, label %mergeD +thenC: + store i32 %value, ptr %b + br label %mergeC +mergeD: + ret void +}
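
Note on the approach (commentary, not part of the patch): the heart of the new hasStoreConflict walk is a per-block query that combines MemorySSA's def list for a block with a cached alias-analysis mod/ref check. The standalone sketch below restates that query in isolation. It is illustrative only; blockMayClobberLoad is an invented name under assumed surrounding code, not a helper from the patch:

// Minimal sketch of the per-block clobber test the patch builds on: scan the
// MemoryDefs that MemorySSA recorded for a block and ask alias analysis
// whether any of them may write the location a given load reads.
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static bool blockMayClobberLoad(const BasicBlock *BB, const LoadInst *Load,
                                MemorySSA &MSSA, BatchAAResults &BAA) {
  // getBlockDefs returns the MemoryDefs (and MemoryPhis) in BB, or null if
  // the block contains no memory-writing instructions at all.
  const auto *Defs = MSSA.getBlockDefs(BB);
  if (!Defs)
    return false;
  const MemoryLocation Loc = MemoryLocation::get(Load);
  for (const MemoryAccess &MA : *Defs)
    if (const auto *MD = dyn_cast<MemoryDef>(&MA))
      // isModSet is conservative: "may write" is enough to block sinking.
      if (isModSet(BAA.getModRefInfo(MD->getMemoryInst(), Loc)))
        return true;
  return false;
}

Two design points worth noting: BatchAAResults is used rather than plain AAResults because the same load location is tested against every MemoryDef on every walked path, and the batch interface caches alias queries; and getBlockDefs keeps the walk proportional to the number of memory-writing instructions per block (bounded overall by -sink-load-store-limit) rather than to all instructions.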