diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ca8e3244edd15..674715e60de25 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1278,6 +1278,23 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( if (Opcode == AMDGPU::S_WAITCNT) { unsigned IEnc = II.getOperand(0).getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); + + // These pseudo waitcnt instructions are only needed to synchronize DS + // operations with direct LDS loads that use vmcnt. We can safely relax + // them when no outstanding direct LDS loads exist, even if other vmcnt + // events are pending. + if (II.getOpcode() == AMDGPU::S_WAITCNT_DIRECT_LDS_LOAD_soft && + TrySimplify) { + unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; + AMDGPU::Waitcnt LDSDirectWait; + ScoreBrackets.determineWait(LOAD_CNT, RegNo, LDSDirectWait); + // Relax waitcnt to only wait on inflight direct LDS loads. 
+ if (LDSDirectWait.LoadCnt > OldWait.LoadCnt) { + OldWait.LoadCnt = LDSDirectWait.LoadCnt; + Modified = true; + } + } + if (TrySimplify) ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 64ab064a75f44..1465fed5315e8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1010,6 +1010,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) { switch (Opcode) { case AMDGPU::S_WAITCNT_soft: + case AMDGPU::S_WAITCNT_DIRECT_LDS_LOAD_soft: return AMDGPU::S_WAITCNT; case AMDGPU::S_WAITCNT_VSCNT_soft: return AMDGPU::S_WAITCNT_VSCNT; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 40b3dfb94ce2f..46a641704bfeb 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1608,6 +1608,12 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">; def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } +// Soft waitcnt for direct loads to LDS from global memory. These waits may be +// relaxed or removed entirely based on current in-flight memory operations +// and their relation to these direct LDS loads. For example, if global loads +// to LDS are mixed with global loads not writing to LDS, a wait may only be +// necessary for the LDS-writing loads to synchronize with other LDS operations. 
+def S_WAITCNT_DIRECT_LDS_LOAD_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">; def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir index 21372c06d3223..6c2854b67b134 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir @@ -117,3 +117,173 @@ body: | S_ENDPGM 0 ... + +# The soft waitcnt should be honored here because a direct LDS load precedes it. +# GCN-LABEL: name: buffer_load_dword_lds_ds_read_soft_wait +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: S_BARRIER +--- +name: buffer_load_dword_lds_ds_read_soft_wait +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + S_BARRIER + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... + +# No waitcnt is needed here because the preceding BUFFER_STORE_LDS_DWORD is not a direct load to LDS. +# GCN-LABEL: name: buffer_store_lds_dword_ds_read_soft_wait +# GCN: BUFFER_STORE_LDS_DWORD +# GCN-NEXT: S_BARRIER +--- +name: buffer_store_lds_dword_ds_read_soft_wait +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(3) poison` + 4), (store (s32) into `ptr addrspace(1) poison` + 4) + S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + S_BARRIER + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... + +# Soft waitcnt should mean vmcnt(1) before the barrier and vmcnt(0) after.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 3953 +# vmcnt(1) +# GCN-NEXT: S_BARRIER +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: DS_READ_B32_gfx9 +--- +name: series_of_buffer_load_dword_lds_ds_read_soft_wait +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8) + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + S_BARRIER + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... + +# No waitcnt before the barrier because the requested vmcnt(1) is redundant with only one load in flight. +# GCN-LABEL: name: buffer_load_dword_lds_ds_read_soft_wait_redundant +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_BARRIER +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: DS_READ_B32_gfx9 +--- +name: buffer_load_dword_lds_ds_read_soft_wait_redundant +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + S_BARRIER + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +...
+ +# Repeated identical soft waitcnts should be combined into a single waitcnt. +# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_repeat +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 3953 +# vmcnt(1) +# GCN-NEXT: S_BARRIER +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: DS_READ_B32_gfx9 +--- +name: series_of_buffer_load_dword_lds_ds_read_soft_wait_repeat +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8) + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + S_BARRIER + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... + +# Soft waitcnts with different counts should be merged into a single waitcnt.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_merge +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 3953 +# vmcnt(1) +# GCN-NEXT: S_BARRIER +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: DS_READ_B32_gfx9 +--- +name: series_of_buffer_load_dword_lds_ds_read_soft_wait_merge +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8) + S_WAITCNT_DIRECT_LDS_LOAD_soft 3954 + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + S_BARRIER + S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... + + +# The preexisting S_WAITCNT 0 is kept and no soft waitcnt remains before the barrier.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_preexisting +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 0 +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_BARRIER +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: DS_READ_B32_gfx9 +--- +name: series_of_buffer_load_dword_lds_ds_read_soft_wait_preexisting +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + S_WAITCNT 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8) + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + S_BARRIER + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + S_WAITCNT_DIRECT_LDS_LOAD_soft 3953 + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +...