Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 51 additions & 47 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -633,8 +633,11 @@ class WaitcntBrackets {
const MachineOperand &Op) const;

bool counterOutOfOrder(InstCounterType T) const;
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);

void determineWait(InstCounterType T, RegInterval Interval,
AMDGPU::Waitcnt &Wait) const;
Expand All @@ -646,7 +649,6 @@ class WaitcntBrackets {

void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
void applyXcnt(const AMDGPU::Waitcnt &Wait);
void updateByEvent(WaitEventType E, MachineInstr &MI);

unsigned hasPendingEvent() const { return PendingEvents; }
Expand Down Expand Up @@ -1192,15 +1194,15 @@ void WaitcntBrackets::print(raw_ostream &OS) const {

/// Simplify the waitcnt, in the sense of removing redundant counts. The
/// simplified counts are written back into \p Wait; nothing is returned.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
simplifyWaitcnt(DS_CNT, Wait.DsCnt);
simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
simplifyWaitcnt(X_CNT, Wait.XCnt);
simplifyXcnt(Wait, Wait);
}

void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
Expand Down Expand Up @@ -1270,7 +1272,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
applyWaitcnt(BVH_CNT, Wait.BvhCnt);
applyWaitcnt(KM_CNT, Wait.KmCnt);
applyXcnt(Wait);
applyWaitcnt(X_CNT, Wait.XCnt);
}

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
Expand All @@ -1287,41 +1289,41 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}
}

void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
// On entry to a block with multiple predecessors, there may
// be pending SMEM and VMEM events active at the same time.
// In such cases, only clear one active event at a time.
auto applyPendingXcntGroup = [this](unsigned E) {
unsigned LowerBound = getScoreLB(X_CNT);
applyWaitcnt(X_CNT, 0);
PendingEvents |= (1 << E);
setScoreLB(X_CNT, LowerBound);
};

bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
// Wait on XCNT is redundant if we are already waiting for a load to complete.
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
// zero.
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
if (hasPendingEvent(VMEM_GROUP))
applyPendingXcntGroup(VMEM_GROUP);
else
applyWaitcnt(X_CNT, 0);
return;
}
return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
}

bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
// If we have a pending store we cannot optimize XCnt because we do not wait
// for stores. VMEM loads return in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
!hasPendingEvent(STORE_CNT)) {
if (hasPendingEvent(SMEM_GROUP))
applyPendingXcntGroup(SMEM_GROUP);
else
applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
return;
}
return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
!hasPendingEvent(STORE_CNT);
}

applyWaitcnt(X_CNT, Wait.XCnt);
/// Simplify the XCnt component of a wait by taking the KmCnt and LoadCnt
/// waits into account.
///
/// \param CheckWait  Wait whose KmCnt, LoadCnt and XCnt values are used for
///                   the redundancy checks.
/// \param UpdateWait Wait whose XCnt field is simplified at the end. It may
///                   be the same object as \p CheckWait.
///
/// This is not a pure query: depending on the checks it calls
/// applyWaitcnt() on X_CNT or clears the SMEM_GROUP / VMEM_GROUP bit in
/// PendingEvents before simplifying \p UpdateWait.XCnt.
void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
AMDGPU::Waitcnt &UpdateWait) {
// Try to simplify xcnt further by checking for joint kmcnt and loadcnt
// optimizations. On entry to a block with multiple predecessors, there may
// be pending SMEM and VMEM events active at the same time.
// In such cases, only clear one active event at a time.
if (hasRedundantXCntWithKmCnt(CheckWait)) {
// NOTE(review): hasMixedPendingEvents(X_CNT) presumably reports that more
// than one event type is pending on X_CNT -- confirm against its definition.
if (!hasMixedPendingEvents(X_CNT)) {
applyWaitcnt(X_CNT, 0);
} else {
// Mixed events are pending: only drop the SMEM group bit.
PendingEvents &= ~(1 << SMEM_GROUP);
}
} else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
if (!hasMixedPendingEvents(X_CNT)) {
// Apply the smaller of the requested XCnt and LoadCnt values.
applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
} else if (CheckWait.LoadCnt == 0) {
// Mixed events are pending: the VMEM group bit is only dropped when the
// LoadCnt wait is for zero.
PendingEvents &= ~(1 << VMEM_GROUP);
}
}
// Run the per-counter simplification on the XCnt of the wait being updated.
simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
}

// Where there are multiple types of event in the bracket of a counter,
Expand Down Expand Up @@ -1656,6 +1658,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}
}

// Save the pre-combine waitcnt in order to make xcnt checks.
AMDGPU::Waitcnt PreCombine = Wait;
if (CombinedLoadDsCntInstr) {
// Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
// to be waited for. Otherwise, let the instruction be deleted so
Expand Down Expand Up @@ -1746,6 +1750,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}

for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
(CT == LOAD_CNT &&
ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
// Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
// due to taking the backedge of a block.
ScoreBrackets.simplifyXcnt(PreCombine, Wait);
}
if (!WaitInstrs[CT])
continue;

Expand Down Expand Up @@ -2092,6 +2103,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);

// Since the translation for VMEM addresses occurs in-order, we can apply the
// XCnt if the current instruction is of VMEM type and has a memory
// dependency with another VMEM instruction in flight.
if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
Wait.XCnt = ~0u;
}

// When forcing emit, we need to skip terminators because that would break the
// terminators of the MBB if we emit a waitcnt between terminators.
if (ForceEmitZeroFlag && !MI.isTerminator())
Expand Down Expand Up @@ -2160,21 +2179,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}

// XCnt may be already consumed by a load wait.
if (Wait.XCnt != ~0u) {
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
Wait.XCnt = ~0u;

if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
Wait.XCnt = ~0u;

// Since the translation for VMEM addresses occurs in-order, we can skip the
// XCnt if the current instruction is of VMEM type and has a memory
// dependency with another VMEM instruction in flight.
if (isVmemAccess(*It))
Wait.XCnt = ~0u;
}

Comment on lines -2157 to -2171
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nhaehnle @jayfoad Thoughts on this case (see changes in bf16.ll)? -

; GFX1250-LABEL: test_load_store_f32_to_bf16:
; GFX1250:       ; %bb.0:
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    s_wait_kmcnt 0x0
; GFX1250-NEXT:    global_load_b32 v0, v[0:1], off

Debug output before the next instruction:
    LOAD_CNT(1): 0:v0 0:v1
    DS_CNT(0):
    EXP_CNT(0):
    STORE_CNT(63):
    SAMPLE_CNT(0):
    BVH_CNT(0):
    KM_CNT(0): 
    X_CNT(1): 0:v0 0:v1 0:v2 0:v3 0:s126 0:s127
Pending Events: VMEM_READ_ACCESS, VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS, VMEM_GROUP

; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT:    global_store_b16 v[2:3], v0, off
; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
  %val = load float, ptr addrspace(1) %in
  %val.bf16 = fptrunc float %val to bfloat
  store bfloat %val.bf16, ptr addrspace(1) %out
  ret void
}

Previously the xcnt was not waited on because of the check on L2168, but now it also requires !hasPendingEvent(STORE_CNT). I think my change is correct because only VMEM loads return in order.

Copy link
Contributor Author

@RyanRio RyanRio Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is waiting for #166779 though (currently just includes Jay's fix in this PR.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also just have this one supersede it, up to you Jay.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apparently there was a late change to gfx1250 that made it so every s_wait_loadcnt (and storecnt) also waits for the equivalent xcnt value. We need to get some clarity on that internally.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting. I also see that hardware waits for xcnt==0 at every branch/call, which probably means that SIInsertWaitcnts does not after all have to handle a mixture of pending SMEM Xcnts and VMEM Xcnts.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a todo to revisit.

if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;

Expand Down
9 changes: 0 additions & 9 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1501,7 +1501,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -1574,7 +1573,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -1649,7 +1647,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -1722,7 +1719,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -1913,7 +1909,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -1959,7 +1954,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
Expand Down Expand Up @@ -2002,7 +1996,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
Expand Down Expand Up @@ -2047,7 +2040,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -2210,7 +2202,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
Expand Down
14 changes: 14 additions & 0 deletions llvm/test/CodeGen/AMDGPU/bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2520,6 +2520,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: global_store_b16 v[2:3], v0, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
Expand Down Expand Up @@ -2783,6 +2784,7 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
Expand Down Expand Up @@ -2872,6 +2874,7 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
Expand Down Expand Up @@ -6850,6 +6853,7 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
Expand Down Expand Up @@ -6943,6 +6947,7 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
Expand Down Expand Up @@ -7033,6 +7038,7 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
Expand Down Expand Up @@ -7134,6 +7140,7 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
Expand Down Expand Up @@ -7251,6 +7258,7 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
Expand Down Expand Up @@ -7367,6 +7375,7 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
Expand Down Expand Up @@ -8001,6 +8010,7 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
Expand Down Expand Up @@ -8241,6 +8251,7 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v4, 16, v3
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
Expand Down Expand Up @@ -8377,6 +8388,7 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v5, 16, v3
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
Expand Down Expand Up @@ -8522,6 +8534,7 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v5
Expand Down Expand Up @@ -8693,6 +8706,7 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v8 :: v_dual_lshlrev_b32 v4, 16, v9
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v9
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scale_offset scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 4, v[0:1]
; GCN-NEXT: v_add_co_u32 v2, s0, v2, 1
; GCN-NEXT: s_and_b32 vcc_lo, exec_lo, s0
Expand Down
Loading