53 changes: 47 additions & 6 deletions llvm/lib/Transforms/Scalar/Sink.cpp
@@ -27,6 +27,21 @@ using namespace llvm;
 STATISTIC(NumSunk, "Number of instructions sunk");
 STATISTIC(NumSinkIter, "Number of sinking iterations");
 
+static bool hasStoreConflict(Instruction *Inst, AliasAnalysis &AA,
+                             SmallPtrSetImpl<Instruction *> &Stores) {
+  if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
+    MemoryLocation Loc = MemoryLocation::get(L);
+    for (Instruction *S : Stores)
+      if (isModSet(AA.getModRefInfo(S, Loc)))
+        return true;
+  } else if (auto *Call = dyn_cast<CallBase>(Inst)) {
+    for (Instruction *S : Stores)
+      if (isModSet(AA.getModRefInfo(S, Call)))
+        return true;
+  }
+  return false;
+}
+
 static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
                          SmallPtrSetImpl<Instruction *> &Stores) {
 
@@ -60,10 +75,36 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
   return true;
 }
 
+using BlocksSet = SmallPtrSet<BasicBlock *, 8>;
+static void findStores(SmallPtrSetImpl<Instruction *> &Stores,
+                       BasicBlock *LoadBB, BasicBlock *BB,
+                       BlocksSet &VisitedBlocksSet) {
+  if (BB == LoadBB || !VisitedBlocksSet.insert(BB).second)
+    return;
+
+  for (Instruction &Inst : *BB)
+    if (Inst.mayWriteToMemory())
+      Stores.insert(&Inst);
+  for (BasicBlock *Pred : predecessors(BB))
+    findStores(Stores, LoadBB, Pred, VisitedBlocksSet);
+}
+
+static bool hasConflictingStoreBeforeSuccToSinkTo(AliasAnalysis &AA,
+                                                  Instruction *ReadMemInst,
+                                                  BasicBlock *SuccToSinkTo) {
+  BlocksSet VisitedBlocksSet;
+  SmallPtrSet<Instruction *, 8> Stores;
+  BasicBlock *LoadBB = ReadMemInst->getParent();
+  for (BasicBlock *Pred : predecessors(SuccToSinkTo))
+    findStores(Stores, LoadBB, Pred, VisitedBlocksSet);
+  return hasStoreConflict(ReadMemInst, AA, Stores);
+}
+
 /// IsAcceptableTarget - Return true if it is possible to sink the instruction
 /// in the specified basic block.
-static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
-                               DominatorTree &DT, LoopInfo &LI) {
+static bool IsAcceptableTarget(AliasAnalysis &AA, Instruction *Inst,
+                               BasicBlock *SuccToSinkTo, DominatorTree &DT,
+                               LoopInfo &LI) {
   assert(Inst && "Instruction to be sunk is null");
   assert(SuccToSinkTo && "Candidate sink target is null");
 
@@ -76,10 +117,10 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
   // just punt.
   // FIXME: Split critical edges if not backedges.
   if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
-    // We cannot sink a load across a critical edge - there may be stores in
-    // other code paths.
+    // Ensure that there is no conflicting store on any path to SuccToSinkTo.
     if (Inst->mayReadFromMemory() &&
-        !Inst->hasMetadata(LLVMContext::MD_invariant_load))
+        !Inst->hasMetadata(LLVMContext::MD_invariant_load) &&
+        hasConflictingStoreBeforeSuccToSinkTo(AA, Inst, SuccToSinkTo))
       return false;
 
     // We don't want to sink across a critical edge if we don't dominate the
@@ -153,7 +194,7 @@ static bool SinkInstruction(Instruction *Inst,
   // The nearest common dominator may be in a parent loop of BB, which may not
   // be beneficial. Find an ancestor.
   while (SuccToSinkTo != BB &&
-         !IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI))
+         !IsAcceptableTarget(AA, Inst, SuccToSinkTo, DT, LI))
     SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
   if (SuccToSinkTo == BB)
     SuccToSinkTo = nullptr;
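Taken together, the new helpers and the updated check in IsAcceptableTarget relax the old rule that a memory-reading instruction without !invariant.load metadata may never be sunk into a block with more than one predecessor: sinking is now refused only when the walk over the sink target's predecessors back to the instruction's own block finds an instruction that may write memory and that alias analysis says may modify the read location. Below is a minimal LLVM IR sketch of the two cases; it is illustrative only (the function and value names are made up and do not come from the patch or its tests).

; %merge has two predecessors, so the old code always kept the load in %entry.
define i32 @sink_ok(ptr %p, i1 %c) {
entry:
  %v = load i32, ptr %p              ; only used in %merge
  br i1 %c, label %side, label %merge

side:                                ; nothing on this path writes memory,
  br label %merge                    ; so the load may now be sunk into %merge

merge:
  %r = add i32 %v, 1
  ret i32 %r
}

define i32 @sink_blocked(ptr %p, i1 %c) {
entry:
  %v = load i32, ptr %p              ; only used in %merge
  br i1 %c, label %side, label %merge

side:
  store i32 0, ptr %p                ; a conflicting store on another path
  br label %merge                    ; into %merge still blocks the sink

merge:
  %r = add i32 %v, 1
  ret i32 %r
}

Note that findStores walks predecessors recursively and stops only at the load's own block (or at already-visited blocks), so the set of scanned blocks is conservative; any may-write instruction it collects that may alias the load keeps the instruction where it is.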
86 changes: 44 additions & 42 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1330,13 +1330,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [8 x i32], ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %dummy) {
; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[0:3], 0 addr64
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX7-NEXT: s_mov_b64 vcc, 0
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
@@ -1355,24 +1349,22 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX7-NEXT: s_or_b64 vcc, s[8:9], s[0:1]
; GFX7-NEXT: .LBB13_2: ; %exit
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX8-NEXT: s_mov_b64 vcc, 0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
@@ -1391,25 +1383,29 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX8-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX8-NEXT: .LBB13_2: ; %exit
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v1, s[0:1]
; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 8
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W32: ; %bb.0: ; %entry
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1]
; GFX10_W32-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
; GFX10_W32-NEXT: s_and_saveexec_b32 s1, s0
; GFX10_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W32-NEXT: ; %bb.1: ; %bb
@@ -1426,22 +1422,23 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX10_W32-NEXT: s_or_b32 vcc_lo, s2, s0
; GFX10_W32-NEXT: .LBB13_2: ; %exit
; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX10_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] offset:8
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W64: ; %bb.0: ; %entry
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX10_W64-NEXT: s_mov_b64 vcc, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1]
; GFX10_W64-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX10_W64-NEXT: s_mov_b64 vcc, 0
; GFX10_W64-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX10_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W64-NEXT: ; %bb.1: ; %bb
@@ -1458,24 +1455,25 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX10_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX10_W64-NEXT: .LBB13_2: ; %exit
; GFX10_W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; GFX10_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] offset:8
; GFX10_W64-NEXT: s_endpgm
;
; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W32: ; %bb.0: ; %entry
; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
; GFX11_W32-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX11_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11_W32-NEXT: s_mov_b32 vcc_lo, 0
; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v3
; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W32-NEXT: s_mov_b32 s1, exec_lo
; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W32-NEXT: ; %bb.1: ; %bb
; GFX11_W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x50
@@ -1491,6 +1489,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX11_W32-NEXT: s_or_b32 vcc_lo, s2, s0
; GFX11_W32-NEXT: .LBB13_2: ; %exit
; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11_W32-NEXT: s_waitcnt vmcnt(0)
; GFX11_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
@@ -1501,14 +1503,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX11_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W64: ; %bb.0: ; %entry
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
; GFX11_W64-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX11_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11_W64-NEXT: s_mov_b64 vcc, 0
; GFX11_W64-NEXT: s_mov_b64 s[2:3], exec
; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v3
; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W64-NEXT: ; %bb.1: ; %bb
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x50
@@ -1524,6 +1522,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX11_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX11_W64-NEXT: .LBB13_2: ; %exit
; GFX11_W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11_W64-NEXT: s_waitcnt vmcnt(0)
; GFX11_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
@@ -877,14 +877,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_lshl_b32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s53, s4, 1
; CHECK-NEXT: s_add_i32 s6, s4, 5
; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_read_u8 v46, v0
; CHECK-NEXT: v_mov_b32_e32 v56, s53
; CHECK-NEXT: v_or3_b32 v46, s5, v42, s53
; CHECK-NEXT: v_mov_b32_e32 v47, s53
; CHECK-NEXT: s_mov_b32 s5, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_5
@@ -898,46 +895,48 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_add_i32 s7, s7, 4
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_add_i32 s8, s4, s7
; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v46
; CHECK-NEXT: s_add_i32 s9, s8, 5
; CHECK-NEXT: s_add_i32 s8, s8, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
; CHECK-NEXT: v_mov_b32_e32 v56, s8
; CHECK-NEXT: v_mov_b32_e32 v47, s8
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
; CHECK-NEXT: ; %bb.4: ; %Flow3
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: v_mov_b32_e32 v47, v0
; CHECK-NEXT: v_mov_b32_e32 v46, v0
; CHECK-NEXT: .LBB1_5: ; %Flow4
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_mov_b32 s54, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
; CHECK-NEXT: v_cmpx_lt_u32_e64 v47, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_11
; CHECK-NEXT: ; %bb.6: ; %.103.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_mov_b32 s55, 0
; CHECK-NEXT: ds_read_u8 v56, v0
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: s_branch .LBB1_8
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_7: ; %.114
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64
; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v47, v41
; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: s_cbranch_execz .LBB1_10
; CHECK-NEXT: .LBB1_8: ; %.103
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v47
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s64, s4
; CHECK-NEXT: s_cbranch_execz .LBB1_7
; CHECK-NEXT: ; %bb.9: ; %.110
@@ -958,7 +957,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v47
; CHECK-NEXT: ds_write_b32 v0, v46
; CHECK-NEXT: s_branch .LBB1_7
; CHECK-NEXT: .LBB1_10: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -73,8 +73,8 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 }
 
 ; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
-; GFX908: NumSgprs: 64
-; GFX908-GCNTRACKERS: NumSgprs: 64
+; GFX908: NumSgprs: 56
+; GFX908-GCNTRACKERS: NumSgprs: 56
 ; GFX908: NumVgprs: 43
 ; GFX908-GCNTRACKERS: NumVgprs: 39
 ; GFX908: Occupancy: 5
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -72,13 +72,14 @@ entry:
 
 a:
   %v2 = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 1, i32 0)
+  %v3 = fadd <2 x float> %v1, %v2
   %v20 = extractelement <2 x float> %v2, i32 0
   %v21 = extractelement <2 x float> %v2, i32 1
   %cond2 = fcmp ult float %v20, %v21
   br i1 %cond2, label %b, label %c
 
 b:
-  ret <2 x float> %v2
+  ret <2 x float> %v3
 
 c:
   %v4 = fadd <2 x float> %v1, %v1
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -41,7 +41,8 @@ bb:
   %tmp20 = extractelement <4 x float> %tmp18, i32 1
   %tmp21 = extractelement <4 x float> %tmp18, i32 2
   %tmp22 = extractelement <4 x float> %tmp18, i32 3
-  %tmp23 = bitcast float %tmp14 to i32
+  %tmp23 = fadd float %tmp14, %tmp22
+  %tmp24 = bitcast float %tmp23 to i32
   br label %bb24
 
 bb24: ; preds = %bb157, %bb
@@ -218,7 +219,7 @@ bb156: ; preds = %bb24
 bb157: ; preds = %bb24
   %tmp158 = bitcast float %tmp107 to i32
   %tmp159 = bitcast float %tmp107 to i32
-  %tmp160 = add i32 %tmp23, %tmp159
+  %tmp160 = add i32 %tmp24, %tmp159
   %tmp161 = bitcast i32 %tmp160 to float
   %tmp162 = insertelement <128 x float> poison, float %tmp103, i32 0
   %tmp163 = insertelement <128 x float> %tmp162, float %tmp102, i32 1