From 427d6c44d65132d3b4680c257b87e5e667df642b Mon Sep 17 00:00:00 2001 From: pvanhout Date: Thu, 4 Dec 2025 14:50:42 +0100 Subject: [PATCH 1/2] [AMDGPU][SIInsertWaitcnts] Wait on all LDS DMA operations when no aliasing store is found Previously, we would miss inserting a wait if the ds_read had AA info, but it didn't match any LDS DMA op. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 18 +++++++++++++----- llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll | 5 ++++- llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll | 1 + 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 79c3394b2df50..69fae98d4a13f 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2007,18 +2007,26 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // LOAD_CNT is only relevant to vgpr or LDS. unsigned RegNo = FIRST_LDS_VGPR; + bool FoundAliasingStore = false; if (Ptr && Memop->getAAInfo()) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { - if (MI.mayAlias(AA, *LDSDMAStores[I], true)) + if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { + FoundAliasingStore = true; ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); + } } - } else { - ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); } - if (Memop->isStore()) { + + // TODO?: Is it possible to have cases where we'd alias with both a + // store tracked in LDSDMAStores, and one that isn't ? If so, the + // current system would only wait on the tracked store, and not the + // "generic" entry. + if (!FoundAliasingStore) + ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + + if (Memop->isStore()) ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); - } } // Loop over use and def operands. diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll index 37ba1f42413c9..1cf0fa2768de6 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll @@ -223,6 +223,7 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: ds_read_b32 v7, v9 offset:1792 ; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_read_b32 v8, v9 offset:2048 ; GFX9-NEXT: ; wave barrier ; GFX9-NEXT: ds_read_b32 v9, v9 offset:2304 @@ -288,6 +289,7 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: ds_read_b32 v7, v9 offset:1792 ; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_read_b32 v8, v9 offset:2048 ; GFX10-NEXT: ; wave barrier ; GFX10-NEXT: ds_read_b32 v9, v9 offset:2304 @@ -374,7 +376,7 @@ define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) noc ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_lshl_b32 s0, s3, 2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_read_b32 v0, v0 offset:512 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_read_b32 v1, v1 offset:768 @@ -397,6 +399,7 @@ define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) noc ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_lshl_b32 s0, s3, 2 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_read_b32 v0, v0 offset:512 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) ; GFX10-NEXT: ds_read_b32 v1, v1 offset:768 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll index a00aca34252b1..7f975549fc55e 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll @@ -23,6 +23,7 @@ define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr add ; CHECK-NEXT: ; sched_barrier mask(0x00000000) ; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: v_mov_b32_e32 v2, s3 +; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: ds_write_b32 v1, v3 ; CHECK-NEXT: ds_write_b32 v2, v3 ; CHECK-NEXT: ; sched_barrier mask(0x00000000) From 0551da505eafc6231c4ce7aeba733474e1d677f1 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Thu, 4 Dec 2025 15:53:15 +0100 Subject: [PATCH 2/2] Update fix --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 26 +++++++++++--------- llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll | 3 +-- llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll | 1 - 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 69fae98d4a13f..0e9d00b9165de 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1086,13 +1086,17 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { } } } - if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1) + if (Slot) break; + // The slot may not be valid because it can be >= NUM_LDS_VGPRS which + // means the scoreboard cannot track it. We still want to preserve the + // MI in order to check alias information, though. LDSDMAStores.push_back(&Inst); Slot = LDSDMAStores.size(); break; } - setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); + if (Slot < NUM_LDS_VGPRS) + setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); if (Slot) setRegScore(FIRST_LDS_VGPR, T, CurrScore); } @@ -2007,23 +2011,23 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // LOAD_CNT is only relevant to vgpr or LDS. unsigned RegNo = FIRST_LDS_VGPR; - bool FoundAliasingStore = false; if (Ptr && Memop->getAAInfo()) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { - FoundAliasingStore = true; + if ((I + 1) >= NUM_LDS_VGPRS) { + // We didn't have enough slot to track this LDS DMA store, it + // has been tracked using the common RegNo (FIRST_LDS_VGPR). + ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + break; + } + ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); } } - } - - // TODO?: Is it possible to have cases where we'd alias with both a - // store tracked in LDSDMAStores, and one that isn't ? If so, the - // current system would only wait on the tracked store, and not the - // "generic" entry. - if (!FoundAliasingStore) + } else { ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + } if (Memop->isStore()) ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll index 1cf0fa2768de6..74513ec9106bc 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll @@ -376,7 +376,7 @@ define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) noc ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_lshl_b32 s0, s3, 2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: ds_read_b32 v0, v0 offset:512 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_read_b32 v1, v1 offset:768 @@ -399,7 +399,6 @@ define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) noc ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_lshl_b32 s0, s3, 2 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_read_b32 v0, v0 offset:512 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) ; GFX10-NEXT: ds_read_b32 v1, v1 offset:768 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll index 7f975549fc55e..a00aca34252b1 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll @@ -23,7 +23,6 @@ define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr add ; CHECK-NEXT: ; sched_barrier mask(0x00000000) ; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: v_mov_b32_e32 v2, s3 -; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: ds_write_b32 v1, v3 ; CHECK-NEXT: ds_write_b32 v2, v3 ; CHECK-NEXT: ; sched_barrier mask(0x00000000)