Skip to content

Commit 5e4505d

Browse files
authored
[AMDGPU][SIInsertWaitCnts] Gfx12.5 - Refactor xcnt optimization (#164357)
Refactor the XCnt optimization checks so that they can be checked when applying a pre-existing waitcnt. This removes unnecessary xcnt waits when taking a loop backedge.
1 parent 751a943 commit 5e4505d

13 files changed

+90
-110
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 52 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,7 @@ class SIInsertWaitcnts {
531531
// instruction.
532532
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
533533
switch (Inst.getOpcode()) {
534+
// FIXME: GLOBAL_INV needs to be tracked with xcnt too.
534535
case AMDGPU::GLOBAL_INV:
535536
return VMEM_READ_ACCESS; // tracked using loadcnt
536537
case AMDGPU::GLOBAL_WB:
@@ -633,8 +634,11 @@ class WaitcntBrackets {
633634
const MachineOperand &Op) const;
634635

635636
bool counterOutOfOrder(InstCounterType T) const;
636-
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
637+
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
637638
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
639+
bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
640+
bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
641+
void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);
638642

639643
void determineWait(InstCounterType T, RegInterval Interval,
640644
AMDGPU::Waitcnt &Wait) const;
@@ -646,7 +650,6 @@ class WaitcntBrackets {
646650

647651
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
648652
void applyWaitcnt(InstCounterType T, unsigned Count);
649-
void applyXcnt(const AMDGPU::Waitcnt &Wait);
650653
void updateByEvent(WaitEventType E, MachineInstr &MI);
651654

652655
unsigned hasPendingEvent() const { return PendingEvents; }
@@ -1192,15 +1195,15 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
11921195

11931196
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
11941197
/// whether a waitcnt instruction is needed at all.
1195-
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1198+
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
11961199
simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
11971200
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
11981201
simplifyWaitcnt(DS_CNT, Wait.DsCnt);
11991202
simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
12001203
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
12011204
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
12021205
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1203-
simplifyWaitcnt(X_CNT, Wait.XCnt);
1206+
simplifyXcnt(Wait, Wait);
12041207
}
12051208

12061209
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1270,7 +1273,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
12701273
applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
12711274
applyWaitcnt(BVH_CNT, Wait.BvhCnt);
12721275
applyWaitcnt(KM_CNT, Wait.KmCnt);
1273-
applyXcnt(Wait);
1276+
applyWaitcnt(X_CNT, Wait.XCnt);
12741277
}
12751278

12761279
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1287,35 +1290,42 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
12871290
}
12881291
}
12891292

1290-
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1291-
// On entry to a block with multiple predecessors, there may
1292-
// be pending SMEM and VMEM events active at the same time.
1293-
// In such cases, only clear one active event at a time.
1294-
1293+
bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
12951294
// Wait on XCNT is redundant if we are already waiting for a load to complete.
12961295
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
12971296
// zero.
1298-
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
1299-
if (!hasMixedPendingEvents(X_CNT))
1300-
applyWaitcnt(X_CNT, 0);
1301-
else
1302-
PendingEvents &= ~(1 << SMEM_GROUP);
1303-
return;
1304-
}
1297+
return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
1298+
}
13051299

1300+
bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
13061301
// If we have pending store we cannot optimize XCnt because we do not wait for
13071302
// stores. VMEM loads return in order, so if we only have loads XCnt is
13081303
// decremented to the same number as LOADCnt.
1309-
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1310-
!hasPendingEvent(STORE_CNT)) {
1311-
if (!hasMixedPendingEvents(X_CNT))
1312-
applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1313-
else if (Wait.LoadCnt == 0)
1304+
return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1305+
!hasPendingEvent(STORE_CNT);
1306+
}
1307+
1308+
void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
1309+
AMDGPU::Waitcnt &UpdateWait) {
1310+
// Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1311+
// optimizations. On entry to a block with multiple predecessors, there may
1312+
// be pending SMEM and VMEM events active at the same time.
1313+
// In such cases, only clear one active event at a time.
1314+
// TODO: Revisit xcnt optimizations for gfx1250.
1315+
if (hasRedundantXCntWithKmCnt(CheckWait)) {
1316+
if (!hasMixedPendingEvents(X_CNT)) {
1317+
applyWaitcnt(X_CNT, 0);
1318+
} else {
1319+
PendingEvents &= ~(1 << SMEM_GROUP);
1320+
}
1321+
} else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
1322+
if (!hasMixedPendingEvents(X_CNT)) {
1323+
applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
1324+
} else if (CheckWait.LoadCnt == 0) {
13141325
PendingEvents &= ~(1 << VMEM_GROUP);
1315-
return;
1326+
}
13161327
}
1317-
1318-
applyWaitcnt(X_CNT, Wait.XCnt);
1328+
simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
13191329
}
13201330

13211331
// Where there are multiple types of event in the bracket of a counter,
@@ -1650,6 +1660,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
16501660
}
16511661
}
16521662

1663+
// Save the pre-combine waitcnt in order to make xcnt checks.
1664+
AMDGPU::Waitcnt PreCombine = Wait;
16531665
if (CombinedLoadDsCntInstr) {
16541666
// Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
16551667
// to be waited for. Otherwise, let the instruction be deleted so
@@ -1740,6 +1752,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
17401752
}
17411753

17421754
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1755+
if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
1756+
(CT == LOAD_CNT &&
1757+
ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
1758+
// Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
1759+
// due to taking the backedge of a block.
1760+
ScoreBrackets.simplifyXcnt(PreCombine, Wait);
1761+
}
17431762
if (!WaitInstrs[CT])
17441763
continue;
17451764

@@ -2086,6 +2105,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
20862105
// Verify that the wait is actually needed.
20872106
ScoreBrackets.simplifyWaitcnt(Wait);
20882107

2108+
// Since the translation for VMEM addresses occurs in-order, we can apply the
2109+
// XCnt if the current instruction is of VMEM type and has a memory
2110+
// dependency with another VMEM instruction in flight.
2111+
if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
2112+
ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
2113+
Wait.XCnt = ~0u;
2114+
}
2115+
20892116
// When forcing emit, we need to skip terminators because that would break the
20902117
// terminators of the MBB if we emit a waitcnt between terminators.
20912118
if (ForceEmitZeroFlag && !MI.isTerminator())
@@ -2154,21 +2181,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21542181
<< "Update Instr: " << *It);
21552182
}
21562183

2157-
// XCnt may be already consumed by a load wait.
2158-
if (Wait.XCnt != ~0u) {
2159-
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
2160-
Wait.XCnt = ~0u;
2161-
2162-
if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
2163-
Wait.XCnt = ~0u;
2164-
2165-
// Since the translation for VMEM addresses occurs in-order, we can skip the
2166-
// XCnt if the current instruction is of VMEM type and has a memory
2167-
// dependency with another VMEM instruction in flight.
2168-
if (isVmemAccess(*It))
2169-
Wait.XCnt = ~0u;
2170-
}
2171-
21722184
if (WCG->createNewWaitcnt(Block, It, Wait))
21732185
Modified = true;
21742186

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,7 +1501,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
15011501
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
15021502
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
15031503
; GFX1250-NEXT: s_wait_storecnt 0x0
1504-
; GFX1250-NEXT: s_wait_xcnt 0x0
15051504
; GFX1250-NEXT: s_wait_kmcnt 0x0
15061505
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
15071506
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1574,7 +1573,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
15741573
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
15751574
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
15761575
; GFX1250-NEXT: s_wait_storecnt 0x0
1577-
; GFX1250-NEXT: s_wait_xcnt 0x0
15781576
; GFX1250-NEXT: s_wait_kmcnt 0x0
15791577
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
15801578
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1649,7 +1647,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
16491647
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
16501648
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
16511649
; GFX1250-NEXT: s_wait_storecnt 0x0
1652-
; GFX1250-NEXT: s_wait_xcnt 0x0
16531650
; GFX1250-NEXT: s_wait_kmcnt 0x0
16541651
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
16551652
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1722,7 +1719,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
17221719
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
17231720
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
17241721
; GFX1250-NEXT: s_wait_storecnt 0x0
1725-
; GFX1250-NEXT: s_wait_xcnt 0x0
17261722
; GFX1250-NEXT: s_wait_kmcnt 0x0
17271723
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
17281724
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1913,7 +1909,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
19131909
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
19141910
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
19151911
; GFX1250-NEXT: s_wait_storecnt 0x0
1916-
; GFX1250-NEXT: s_wait_xcnt 0x0
19171912
; GFX1250-NEXT: s_wait_kmcnt 0x0
19181913
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
19191914
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1959,7 +1954,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
19591954
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19601955
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
19611956
; GFX1250-NEXT: s_wait_storecnt 0x0
1962-
; GFX1250-NEXT: s_wait_xcnt 0x0
19631957
; GFX1250-NEXT: s_wait_kmcnt 0x0
19641958
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
19651959
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2002,7 +1996,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
20021996
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
20031997
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
20041998
; GFX1250-NEXT: s_wait_storecnt 0x0
2005-
; GFX1250-NEXT: s_wait_xcnt 0x0
20061999
; GFX1250-NEXT: s_wait_kmcnt 0x0
20072000
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
20082001
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2047,7 +2040,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
20472040
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
20482041
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
20492042
; GFX1250-NEXT: s_wait_storecnt 0x0
2050-
; GFX1250-NEXT: s_wait_xcnt 0x0
20512043
; GFX1250-NEXT: s_wait_kmcnt 0x0
20522044
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
20532045
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2210,7 +2202,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
22102202
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
22112203
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
22122204
; GFX1250-NEXT: s_wait_storecnt 0x0
2213-
; GFX1250-NEXT: s_wait_xcnt 0x0
22142205
; GFX1250-NEXT: s_wait_kmcnt 0x0
22152206
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
22162207
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2520,6 +2520,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
25202520
; GFX1250-NEXT: s_wait_kmcnt 0x0
25212521
; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
25222522
; GFX1250-NEXT: s_wait_loadcnt 0x0
2523+
; GFX1250-NEXT: s_wait_xcnt 0x0
25232524
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
25242525
; GFX1250-NEXT: global_store_b16 v[2:3], v0, off
25252526
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2783,6 +2784,7 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
27832784
; GFX1250-NEXT: s_wait_kmcnt 0x0
27842785
; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
27852786
; GFX1250-NEXT: s_wait_loadcnt 0x0
2787+
; GFX1250-NEXT: s_wait_xcnt 0x0
27862788
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27872789
; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
27882790
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2872,6 +2874,7 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
28722874
; GFX1250-NEXT: s_wait_kmcnt 0x0
28732875
; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
28742876
; GFX1250-NEXT: s_wait_loadcnt 0x0
2877+
; GFX1250-NEXT: s_wait_xcnt 0x0
28752878
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28762879
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
28772880
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
@@ -6850,6 +6853,7 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
68506853
; GFX1250-NEXT: s_wait_kmcnt 0x0
68516854
; GFX1250-NEXT: global_load_b32 v1, v[0:1], off
68526855
; GFX1250-NEXT: s_wait_loadcnt 0x0
6856+
; GFX1250-NEXT: s_wait_xcnt 0x0
68536857
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1
68546858
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
68556859
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -6943,6 +6947,7 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
69436947
; GFX1250-NEXT: s_wait_kmcnt 0x0
69446948
; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
69456949
; GFX1250-NEXT: s_wait_loadcnt 0x0
6950+
; GFX1250-NEXT: s_wait_xcnt 0x0
69466951
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
69476952
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
69486953
; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -7033,6 +7038,7 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
70337038
; GFX1250-NEXT: s_wait_kmcnt 0x0
70347039
; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
70357040
; GFX1250-NEXT: s_wait_loadcnt 0x0
7041+
; GFX1250-NEXT: s_wait_xcnt 0x0
70367042
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
70377043
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
70387044
; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -7134,6 +7140,7 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
71347140
; GFX1250-NEXT: s_wait_kmcnt 0x0
71357141
; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off
71367142
; GFX1250-NEXT: s_wait_loadcnt 0x0
7143+
; GFX1250-NEXT: s_wait_xcnt 0x0
71377144
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
71387145
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
71397146
; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -7251,6 +7258,7 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
72517258
; GFX1250-NEXT: s_wait_kmcnt 0x0
72527259
; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off
72537260
; GFX1250-NEXT: s_wait_loadcnt 0x0
7261+
; GFX1250-NEXT: s_wait_xcnt 0x0
72547262
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5
72557263
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
72567264
; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
@@ -7367,6 +7375,7 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
73677375
; GFX1250-NEXT: s_wait_kmcnt 0x0
73687376
; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off
73697377
; GFX1250-NEXT: s_wait_loadcnt 0x0
7378+
; GFX1250-NEXT: s_wait_xcnt 0x0
73707379
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5
73717380
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
73727381
; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
@@ -8001,6 +8010,7 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
80018010
; GFX1250-NEXT: s_wait_kmcnt 0x0
80028011
; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
80038012
; GFX1250-NEXT: s_wait_loadcnt 0x0
8013+
; GFX1250-NEXT: s_wait_xcnt 0x0
80048014
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0
80058015
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
80068016
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -8241,6 +8251,7 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
82418251
; GFX1250-NEXT: s_wait_kmcnt 0x0
82428252
; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
82438253
; GFX1250-NEXT: s_wait_loadcnt 0x0
8254+
; GFX1250-NEXT: s_wait_xcnt 0x0
82448255
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v4, 16, v3
82458256
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
82468257
; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
@@ -8377,6 +8388,7 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
83778388
; GFX1250-NEXT: s_wait_kmcnt 0x0
83788389
; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off
83798390
; GFX1250-NEXT: s_wait_loadcnt 0x0
8391+
; GFX1250-NEXT: s_wait_xcnt 0x0
83808392
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v5, 16, v3
83818393
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
83828394
; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
@@ -8522,6 +8534,7 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
85228534
; GFX1250-NEXT: s_wait_kmcnt 0x0
85238535
; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off
85248536
; GFX1250-NEXT: s_wait_loadcnt 0x0
8537+
; GFX1250-NEXT: s_wait_xcnt 0x0
85258538
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v4
85268539
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
85278540
; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v5
@@ -8693,6 +8706,7 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
86938706
; GFX1250-NEXT: s_wait_kmcnt 0x0
86948707
; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off
86958708
; GFX1250-NEXT: s_wait_loadcnt 0x0
8709+
; GFX1250-NEXT: s_wait_xcnt 0x0
86968710
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v8 :: v_dual_lshlrev_b32 v4, 16, v9
86978711
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
86988712
; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v9

llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
152152
; GCN-NEXT: s_wait_kmcnt 0x0
153153
; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scale_offset scope:SCOPE_SYS
154154
; GCN-NEXT: s_wait_loadcnt 0x0
155-
; GCN-NEXT: s_wait_xcnt 0x0
156155
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
157156
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
158157
; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]

llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap
2727
; GCN-NEXT: s_wait_dscnt 0x0
2828
; GCN-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
2929
; GCN-NEXT: s_wait_loadcnt 0x0
30-
; GCN-NEXT: s_wait_xcnt 0x0
3130
; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 4, v[0:1]
3231
; GCN-NEXT: v_add_co_u32 v2, s0, v2, 1
3332
; GCN-NEXT: s_and_b32 vcc_lo, exec_lo, s0

0 commit comments

Comments
 (0)