Skip to content

Commit 1bc7bff

Browse files
committed
[AMDGPU] Optimize waitcnt insertion for flat memory operations
Change waitcnt insertion to check the memory operand tokens to see whether a flat memory operation accesses VMEM, in the same way it already checks whether one accesses LDS. This avoids adding waitcnt for counters of address spaces that are not accessed. In addition, only generate the pessimistic waitcnt 0 if a flat memory operation appears to access both VMEM and LDS. This benefits flat memory operations that explicitly specify the address space as GLOBAL or LOCAL. Differential Revision: https://reviews.llvm.org/D89618
1 parent 1298252 commit 1bc7bff

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+584
-508
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
458458
#endif // NDEBUG
459459
}
460460

461+
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
461462
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
462463
bool generateWaitcntInstBefore(MachineInstr &MI,
463464
WaitcntBrackets &ScoreBrackets,
@@ -1194,12 +1195,50 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
11941195
return Modified;
11951196
}
11961197

1197-
// This is a flat memory operation. Check to see if it has memory
1198-
// tokens for both LDS and Memory, and if so mark it as a flat.
1198+
// This is a flat memory operation. Check to see if it has memory tokens other
1199+
// than LDS. Other address spaces supported by flat memory operations involve
1200+
// global memory.
1201+
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1202+
assert(TII->isFLAT(MI));
1203+
1204+
// All flat instructions use the VMEM counter.
1205+
assert(TII->usesVM_CNT(MI));
1206+
1207+
// If there are no memory operands then conservatively assume the flat
1208+
// operation may access VMEM.
1209+
if (MI.memoperands_empty())
1210+
return true;
1211+
1212+
// See if any memory operand specifies an address space that involves VMEM.
1213+
// Flat operations only support FLAT, LOCAL (LDS), or address spaces
1214+
// involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1215+
// (GDS) address space is not supported by flat operations. Therefore, simply
1216+
// return true unless only the LDS address space is found.
1217+
for (const MachineMemOperand *Memop : MI.memoperands()) {
1218+
unsigned AS = Memop->getAddrSpace();
1219+
assert(AS != AMDGPUAS::REGION_ADDRESS);
1220+
if (AS != AMDGPUAS::LOCAL_ADDRESS)
1221+
return true;
1222+
}
1223+
1224+
return false;
1225+
}
1226+
1227+
// This is a flat memory operation. Check to see if it has memory tokens for
1228+
// either LDS or FLAT.
11991229
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1230+
assert(TII->isFLAT(MI));
1231+
1232+
// Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1233+
if (!TII->usesLGKM_CNT(MI))
1234+
return false;
1235+
1236+
// If there are no memory operands then conservatively assume the flat
1237+
// operation may access LDS.
12001238
if (MI.memoperands_empty())
12011239
return true;
12021240

1241+
// See if any memory operand specifies an address space that involves LDS.
12031242
for (const MachineMemOperand *Memop : MI.memoperands()) {
12041243
unsigned AS = Memop->getAddrSpace();
12051244
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
@@ -1226,7 +1265,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
12261265
} else if (TII->isFLAT(Inst)) {
12271266
assert(Inst.mayLoadOrStore());
12281267

1229-
if (TII->usesVM_CNT(Inst)) {
1268+
int FlatASCount = 0;
1269+
1270+
if (mayAccessVMEMThroughFlat(Inst)) {
1271+
++FlatASCount;
12301272
if (!ST->hasVscnt())
12311273
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
12321274
else if (Inst.mayLoad() &&
@@ -1236,15 +1278,19 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
12361278
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
12371279
}
12381280

1239-
if (TII->usesLGKM_CNT(Inst)) {
1281+
if (mayAccessLDSThroughFlat(Inst)) {
1282+
++FlatASCount;
12401283
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1241-
1242-
// This is a flat memory operation, so note it - it will require
1243-
// that both the VM and LGKM be flushed to zero if it is pending when
1244-
// a VM or LGKM dependency occurs.
1245-
if (mayAccessLDSThroughFlat(Inst))
1246-
ScoreBrackets->setPendingFlat();
12471284
}
1285+
1286+
// A Flat memory operation must access at least one address space.
1287+
assert(FlatASCount);
1288+
1289+
// This is a flat memory operation that accesses both VMEM and LDS, so note it
1290+
// - it will require that both the VM and LGKM be flushed to zero if it is
1291+
// pending when a VM or LGKM dependency occurs.
1292+
if (FlatASCount > 1)
1293+
ScoreBrackets->setPendingFlat();
12481294
} else if (SIInstrInfo::isVMEM(Inst) &&
12491295
// TODO: get a better carve out.
12501296
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&

llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
103103
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
104104
; VI-NEXT: flat_store_dword v[0:1], v0
105105
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
106-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
106+
; VI-NEXT: s_waitcnt vmcnt(0)
107107
; VI-NEXT: s_setpc_b64 s[30:31]
108108
%lshr.8 = lshr i32 %arg0, 8
109109
store i32 %lshr.8, i32 addrspace(1)* undef
@@ -527,7 +527,7 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 a
527527
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
528528
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
529529
; VI-NEXT: flat_load_ubyte v0, v[0:1]
530-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
530+
; VI-NEXT: s_waitcnt vmcnt(0)
531531
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
532532
; VI-NEXT: v_mov_b32_e32 v0, s2
533533
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -628,13 +628,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
628628
; VI-NEXT: flat_load_ubyte v3, v[6:7]
629629
; VI-NEXT: v_mov_b32_e32 v5, s3
630630
; VI-NEXT: v_mov_b32_e32 v4, s2
631-
; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
631+
; VI-NEXT: s_waitcnt vmcnt(3)
632632
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
633-
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
633+
; VI-NEXT: s_waitcnt vmcnt(2)
634634
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
635-
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
635+
; VI-NEXT: s_waitcnt vmcnt(1)
636636
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
637-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
637+
; VI-NEXT: s_waitcnt vmcnt(0)
638638
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
639639
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
640640
; VI-NEXT: s_endpgm
@@ -711,7 +711,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias
711711
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
712712
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
713713
; VI-NEXT: flat_load_dword v0, v[0:1]
714-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
714+
; VI-NEXT: s_waitcnt vmcnt(0)
715715
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
716716
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
717717
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -758,7 +758,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
758758
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
759759
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
760760
; VI-NEXT: flat_load_dword v0, v[0:1]
761-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
761+
; VI-NEXT: s_waitcnt vmcnt(0)
762762
; VI-NEXT: v_and_b32_e32 v0, 0xff00, v0
763763
; VI-NEXT: v_cvt_f32_ubyte1_e32 v2, v0
764764
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -805,7 +805,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out,
805805
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
806806
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
807807
; VI-NEXT: flat_load_ubyte v0, v[0:1]
808-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
808+
; VI-NEXT: s_waitcnt vmcnt(0)
809809
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
810810
; VI-NEXT: v_mov_b32_e32 v0, s2
811811
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -874,13 +874,13 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
874874
; VI-NEXT: flat_load_ubyte v3, v[6:7]
875875
; VI-NEXT: v_mov_b32_e32 v5, s3
876876
; VI-NEXT: v_mov_b32_e32 v4, s2
877-
; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
877+
; VI-NEXT: s_waitcnt vmcnt(3)
878878
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
879-
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
879+
; VI-NEXT: s_waitcnt vmcnt(2)
880880
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
881-
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
881+
; VI-NEXT: s_waitcnt vmcnt(1)
882882
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
883-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
883+
; VI-NEXT: s_waitcnt vmcnt(0)
884884
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
885885
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
886886
; VI-NEXT: s_endpgm
@@ -923,7 +923,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
923923
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
924924
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
925925
; VI-NEXT: flat_load_dword v0, v[0:1]
926-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
926+
; VI-NEXT: s_waitcnt vmcnt(0)
927927
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
928928
; VI-NEXT: v_mov_b32_e32 v0, s2
929929
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -969,7 +969,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out
969969
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
970970
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
971971
; VI-NEXT: flat_load_dword v0, v[0:1]
972-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
972+
; VI-NEXT: s_waitcnt vmcnt(0)
973973
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
974974
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
975975
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -1018,7 +1018,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out
10181018
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
10191019
; VI-NEXT: flat_load_dword v0, v[0:1]
10201020
; VI-NEXT: v_mov_b32_e32 v1, 0xff
1021-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1021+
; VI-NEXT: s_waitcnt vmcnt(0)
10221022
; VI-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
10231023
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
10241024
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -1064,7 +1064,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out
10641064
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
10651065
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
10661066
; VI-NEXT: flat_load_dword v0, v[0:1]
1067-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1067+
; VI-NEXT: s_waitcnt vmcnt(0)
10681068
; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0
10691069
; VI-NEXT: v_mov_b32_e32 v0, s2
10701070
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1111,7 +1111,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float a
11111111
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
11121112
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
11131113
; VI-NEXT: flat_load_dword v0, v[0:1]
1114-
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1114+
; VI-NEXT: s_waitcnt vmcnt(0)
11151115
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
11161116
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
11171117
; VI-NEXT: v_add_f32_e32 v2, v0, v1

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(<4 x i128> addrspace(
7272
; GFX8-NEXT: flat_load_dwordx4 v[14:17], v[0:1]
7373
; GFX8-NEXT: s_lshl_b32 s0, s2, 1
7474
; GFX8-NEXT: s_lshl_b32 m0, s0, 1
75-
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
75+
; GFX8-NEXT: s_waitcnt vmcnt(0)
7676
; GFX8-NEXT: v_movrels_b32_e32 v1, v3
7777
; GFX8-NEXT: v_movrels_b32_e32 v0, v2
7878
; GFX8-NEXT: v_mov_b32_e32 v3, v1
@@ -180,13 +180,13 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
180180
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
181181
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16
182182
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16
183-
; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
183+
; GFX8-NEXT: s_waitcnt vmcnt(1)
184184
; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v10, s[4:5]
185185
; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v11, s[4:5]
186186
; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
187187
; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
188188
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
189-
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
189+
; GFX8-NEXT: s_waitcnt vmcnt(0)
190190
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
191191
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
192192
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17
@@ -206,7 +206,7 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
206206
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
207207
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
208208
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17
209-
; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
209+
; GFX8-NEXT: s_waitcnt vmcnt(1)
210210
; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v8, vcc
211211
; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v9, vcc
212212
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17
@@ -219,7 +219,7 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
219219
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
220220
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
221221
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17
222-
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
222+
; GFX8-NEXT: s_waitcnt vmcnt(0)
223223
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
224224
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7]
225225
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7]
@@ -577,7 +577,7 @@ define i128 @extractelement_vgpr_v4i128_idx0(<4 x i128> addrspace(1)* %ptr) {
577577
; GFX8: ; %bb.0:
578578
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
579579
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
580-
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
580+
; GFX8-NEXT: s_waitcnt vmcnt(0)
581581
; GFX8-NEXT: s_setpc_b64 s[30:31]
582582
;
583583
; GFX7-LABEL: extractelement_vgpr_v4i128_idx0:
@@ -612,7 +612,7 @@ define i128 @extractelement_vgpr_v4i128_idx1(<4 x i128> addrspace(1)* %ptr) {
612612
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
613613
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
614614
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
615-
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
615+
; GFX8-NEXT: s_waitcnt vmcnt(0)
616616
; GFX8-NEXT: v_mov_b32_e32 v0, v4
617617
; GFX8-NEXT: v_mov_b32_e32 v1, v5
618618
; GFX8-NEXT: v_mov_b32_e32 v2, v6
@@ -655,7 +655,7 @@ define i128 @extractelement_vgpr_v4i128_idx2(<4 x i128> addrspace(1)* %ptr) {
655655
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
656656
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
657657
; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
658-
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
658+
; GFX8-NEXT: s_waitcnt vmcnt(0)
659659
; GFX8-NEXT: v_mov_b32_e32 v0, v8
660660
; GFX8-NEXT: v_mov_b32_e32 v1, v9
661661
; GFX8-NEXT: v_mov_b32_e32 v2, v10
@@ -698,7 +698,7 @@ define i128 @extractelement_vgpr_v4i128_idx3(<4 x i128> addrspace(1)* %ptr) {
698698
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
699699
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
700700
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
701-
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
701+
; GFX8-NEXT: s_waitcnt vmcnt(0)
702702
; GFX8-NEXT: v_mov_b32_e32 v0, v12
703703
; GFX8-NEXT: v_mov_b32_e32 v1, v13
704704
; GFX8-NEXT: v_mov_b32_e32 v2, v14

0 commit comments

Comments
 (0)