diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 79c3394b2df50..0e9d00b9165de 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1086,13 +1086,17 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { } } } - if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1) + if (Slot) break; + // The slot may not be valid because it can be >= NUM_LDS_VGPRS which + // means the scoreboard cannot track it. We still want to preserve the + // MI in order to check alias information, though. LDSDMAStores.push_back(&Inst); Slot = LDSDMAStores.size(); break; } - setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); + if (Slot < NUM_LDS_VGPRS) + setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); if (Slot) setRegScore(FIRST_LDS_VGPR, T, CurrScore); } @@ -2010,15 +2014,23 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Ptr && Memop->getAAInfo()) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { - if (MI.mayAlias(AA, *LDSDMAStores[I], true)) + if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { + if ((I + 1) >= NUM_LDS_VGPRS) { + // We didn't have enough slot to track this LDS DMA store, it + // has been tracked using the common RegNo (FIRST_LDS_VGPR). + ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + break; + } + ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); + } } } else { ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); } - if (Memop->isStore()) { + + if (Memop->isStore()) ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); - } } // Loop over use and def operands. diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll index 37ba1f42413c9..74513ec9106bc 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll @@ -223,6 +223,7 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: ds_read_b32 v7, v9 offset:1792 ; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_read_b32 v8, v9 offset:2048 ; GFX9-NEXT: ; wave barrier ; GFX9-NEXT: ds_read_b32 v9, v9 offset:2304 @@ -288,6 +289,7 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: ds_read_b32 v7, v9 offset:1792 ; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_read_b32 v8, v9 offset:2048 ; GFX10-NEXT: ; wave barrier ; GFX10-NEXT: ds_read_b32 v9, v9 offset:2304