Skip to content

Commit 98bad91

Browse files
author
git apple-llvm automerger
committed
Merge commit '2e3fa4ba9e0b' from llvm.org/main into next
2 parents 82a77c1 + 2e3fa4b commit 98bad91

File tree

7 files changed

+125
-10
lines changed

7 files changed

+125
-10
lines changed

llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,9 @@ bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
328328

329329
// taken from SIInstrInfo::isAlwaysGDS()
330330
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
331-
return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
331+
return Opcode == AMDGPU::DS_ORDERED_COUNT ||
332+
Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
333+
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
332334
}
333335

334336
} // namespace llvm::mca

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 84 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,16 @@ class WaitcntBrackets {
349349
LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
350350
}
351351

352+
// Return true if a GDS operation recorded via setPendingGDS() is still
// outstanding, i.e. LastGDS lies in the window
// (ScoreLBs[DS_CNT], ScoreUBs[DS_CNT]]: strictly above the DS_CNT lower
// bound and not beyond the DS_CNT upper bound.
bool hasPendingGDS() const {
353+
return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
354+
}
355+
356+
// Return the DS_CNT wait value used to wait for the last recorded GDS
// operation: the distance between the current DS_CNT upper bound and
// LastGDS, clamped to getWaitCountMax(DS_CNT) - 1.
// NOTE(review): callers in this diff only use it after hasPendingGDS()
// returned true; the subtraction is unsigned, so confirm no caller relies on
// it when LastGDS exceeds the upper bound.
unsigned getPendingGDSWait() const {
357+
return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
358+
}
359+
360+
// Remember the current DS_CNT upper-bound score as the last GDS operation.
void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
361+
352362
// Return true if there might be pending writes to the vgpr-interval by VMEM
353363
// instructions with types different from V.
354364
bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
@@ -427,6 +437,8 @@ class WaitcntBrackets {
427437
unsigned PendingEvents = 0;
428438
// Remember the last flat memory operation.
429439
unsigned LastFlat[NUM_INST_CNTS] = {0};
440+
// Remember the last GDS operation.
441+
unsigned LastGDS = 0;
430442
// wait_cnt scores for every vgpr.
431443
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
432444
int VgprUB = -1;
@@ -729,6 +741,10 @@ class SIInsertWaitcnts : public MachineFunctionPass {
729741
MachineInstr *OldWaitcntInstr);
730742
void updateEventWaitcntAfter(MachineInstr &Inst,
731743
WaitcntBrackets *ScoreBrackets);
744+
bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
745+
MachineBasicBlock *Block) const;
746+
bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
747+
WaitcntBrackets &ScoreBrackets);
732748
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
733749
WaitcntBrackets &ScoreBrackets);
734750
};
@@ -1682,6 +1698,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
16821698
}
16831699
}
16841700

1701+
// Wait for any pending GDS instruction to complete before any
1702+
// "Always GDS" instruction.
1703+
if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
1704+
addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
1705+
16851706
if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
16861707
// The function is going to insert a wait on everything in its prolog.
16871708
// This still needs to be careful if the call target is a load (e.g. a GOT
@@ -1986,6 +2007,64 @@ static bool isCacheInvOrWBInst(MachineInstr &Inst) {
19862007
Opc == AMDGPU::GLOBAL_WBINV;
19872008
}
19882009

2010+
// Return true if the next instruction is S_ENDPGM, following fallthrough
2011+
// blocks if necessary.
//
// \p It is the position to start scanning from and \p Block is the block it
// belongs to. Meta instructions are skipped. When the scan runs off the end
// of a block it continues at the first instruction of the next block in the
// parent's block list. Returns false if the last block is exhausted without
// finding a non-meta instruction.
2012+
bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2013+
MachineBasicBlock *Block) const {
2014+
auto BlockEnd = Block->getParent()->end();
2015+
auto BlockIter = Block->getIterator();
2016+
2017+
while (true) {
2018+
// End of the current block: move on to the next block, or give up if there
// is none.
if (It.isEnd()) {
2019+
if (++BlockIter != BlockEnd) {
2020+
It = BlockIter->instr_begin();
2021+
// Re-check isEnd() first, since the new block may be empty.
continue;
2022+
}
2023+
2024+
return false;
2025+
}
2026+
2027+
// Stop at the first non-meta instruction; that is the one to inspect.
if (!It->isMetaInstruction())
2028+
break;
2029+
2030+
It++;
2031+
}
2032+
2033+
// The loop only breaks out with It pointing at a real instruction.
assert(!It.isEnd());
2034+
2035+
return It->getOpcode() == AMDGPU::S_ENDPGM;
2036+
}
2037+
2038+
// Add a wait after an instruction if architecture requirements mandate one.
//
// Two cases request a wait after \p Inst:
//  - precise memory mode is enabled and \p Inst may load or store;
//  - \p Inst is an "always GDS" opcode, which forces DsCnt to 0.
// The requested wait is simplified against \p ScoreBrackets and emitted
// immediately after \p Inst in \p Block. Returns the result of
// generateWaitcnt(); the caller in this diff ORs it into its Modified flag.
2039+
bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2040+
MachineBasicBlock &Block,
2041+
WaitcntBrackets &ScoreBrackets) {
2042+
AMDGPU::Waitcnt Wait;
2043+
bool NeedsEndPGMCheck = false;
2044+
2045+
// NOTE(review): the boolean passed to getAllZeroWaitcnt() is true for stores
// that are not returning atomics; its exact meaning is defined by
// getAllZeroWaitcnt(), which is not visible here.
if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2046+
Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2047+
!SIInstrInfo::isAtomicRet(Inst));
2048+
2049+
// Always-GDS instructions get a full DS counter wait and additionally need
// the S_ENDPGM check below.
if (TII->isAlwaysGDS(Inst.getOpcode())) {
2050+
Wait.DsCnt = 0;
2051+
NeedsEndPGMCheck = true;
2052+
}
2053+
2054+
// Presumably drops the parts of the wait that ScoreBrackets shows are
// already satisfied -- TODO confirm against simplifyWaitcnt().
ScoreBrackets.simplifyWaitcnt(Wait);
2055+
2056+
// The wait is generated at the position directly following Inst.
auto SuccessorIt = std::next(Inst.getIterator());
2057+
bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2058+
/*OldWaitcntInstr=*/nullptr);
2059+
2060+
// If a wait was emitted for an always-GDS instruction and the next real
// instruction is S_ENDPGM, also place an "S_NOP 0" before that successor.
// NOTE(review): the hardware reason for the S_NOP is not stated in this
// diff.
if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2061+
BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2062+
.addImm(0);
2063+
}
2064+
2065+
return Result;
2066+
}
2067+
19892068
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
19902069
WaitcntBrackets *ScoreBrackets) {
19912070
// Now look at the instruction opcode. If it is a memory access
@@ -1998,6 +2077,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
19982077
TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
19992078
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
20002079
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
2080+
ScoreBrackets->setPendingGDS();
20012081
} else {
20022082
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
20032083
}
@@ -2128,6 +2208,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
21282208

21292209
StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
21302210

2211+
if (T == DS_CNT)
2212+
StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2213+
21312214
for (int J = 0; J <= VgprUB; J++)
21322215
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
21332216

@@ -2253,13 +2336,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
22532336

22542337
updateEventWaitcntAfter(Inst, &ScoreBrackets);
22552338

2256-
if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
2257-
AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2258-
Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
2259-
ScoreBrackets.simplifyWaitcnt(Wait);
2260-
Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
2261-
ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
2262-
}
2339+
Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
22632340

22642341
LLVM_DEBUG({
22652342
Inst.print(dbgs());

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4239,7 +4239,9 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
42394239
}
42404240

42414241
bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4242-
return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4242+
return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4243+
Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4244+
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
42434245
}
42444246

42454247
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
2+
3+
---
4+
# GCN-LABEL: name: test_ordered_count
5+
# GCN: bb.0
6+
# GCN: DS_ADD_U32
7+
# GCN: DS_SUB_U32
8+
# GCN-NEXT: S_WAITCNT 64535
9+
# GCN-NEXT: $vgpr3 = DS_ORDERED_COUNT
10+
# GCN-NEXT: S_WAITCNT 64519
11+
# GCN-NEXT: $vgpr4_vgpr5 = DS_ADD_GS_REG_RTN
12+
# GCN-NEXT: S_WAITCNT 64519
13+
# GCN-NEXT: S_NOP 0
14+
15+
name: test_ordered_count
16+
body: |
17+
bb.0:
18+
liveins: $vgpr0, $vgpr1, $vgpr2
19+
20+
DS_ADD_U32 $vgpr1, $vgpr2, 12, -1, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
21+
DS_SUB_U32 $vgpr1, $vgpr2, 12, 0, implicit $m0, implicit $exec :: (load store (s32), addrspace 2)
22+
$vgpr3 = DS_ORDERED_COUNT $vgpr0, 772, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
23+
$vgpr4_vgpr5 = DS_ADD_GS_REG_RTN $vgpr0, 32, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
24+
S_ENDPGM 0
25+
26+
...

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ define amdgpu_gs void @test_add_32(i32 %arg) {
99
; CHECK-LABEL: test_add_32:
1010
; CHECK: ; %bb.0:
1111
; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:16 gds
12+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
13+
; CHECK-NEXT: s_nop 0
1214
; CHECK-NEXT: s_endpgm
1315
%unused = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
1416
ret void
@@ -30,6 +32,8 @@ define amdgpu_gs void @test_add_64(i32 %arg) {
3032
; CHECK-LABEL: test_add_64:
3133
; CHECK: ; %bb.0:
3234
; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:32 gds
35+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
36+
; CHECK-NEXT: s_nop 0
3337
; CHECK-NEXT: s_endpgm
3438
%unused = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
3539
ret void

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ define amdgpu_cs float @ds_ordered_swap(ptr addrspace(2) inreg %gds, i32 %value)
2626
; GCN: s_mov_b32 m0, s0
2727
; VIGFX9-NEXT: s_nop 0
2828
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[VALUE]] offset:4868 gds
29+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
2930
; GCN-NEXT: [[BB]]:
3031
; // Wait for expcnt(0) before modifying EXEC
3132
; GCN-NEXT: s_waitcnt expcnt(0)
3233
; GCN-NEXT: s_or_b64 exec, exec, s[[SAVED]]
33-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
3434
define amdgpu_cs float @ds_ordered_swap_conditional(ptr addrspace(2) inreg %gds, i32 %value) {
3535
entry:
3636
%c = icmp ne i32 %value, 0

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ define amdgpu_gs void @test_sub_32(i32 %arg) {
99
; CHECK-LABEL: test_sub_32:
1010
; CHECK: ; %bb.0:
1111
; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:16 gds
12+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
13+
; CHECK-NEXT: s_nop 0
1214
; CHECK-NEXT: s_endpgm
1315
%unused = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
1416
ret void
@@ -30,6 +32,8 @@ define amdgpu_gs void @test_sub_64(i32 %arg) {
3032
; CHECK-LABEL: test_sub_64:
3133
; CHECK: ; %bb.0:
3234
; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:32 gds
35+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
36+
; CHECK-NEXT: s_nop 0
3337
; CHECK-NEXT: s_endpgm
3438
%unused = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
3539
ret void

0 commit comments

Comments
 (0)