Skip to content

Commit 6e3be02

Browse files
committed
[AMDGPU] introduce S_WAITCNT_FENCE_soft emitted by memory legalizer
The new instruction represents any wait counts resulting from a fence, which the memory legalizer cannot determine on its own. After lowering a fence to the appropriate cache operations and any necessary S_WAIT* instructions, the legalizer hands over further work to SIInsertWaitcnts, using this instruction as a placeholder. The waitcnt inserter can then use information that is not available to the legalizer. Currently this is used to implement efficient waitcnts for direct loads to LDS, based on the ordering, scope and address space specified on the soft fence.
1 parent dbfe5cc commit 6e3be02

File tree

8 files changed

+324
-41
lines changed

8 files changed

+324
-41
lines changed

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
1111
#define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
1212

13+
#include "llvm/ADT/BitmaskEnum.h"
1314
#include "llvm/MC/MCInstrDesc.h"
1415

1516
namespace llvm {
@@ -419,6 +420,38 @@ enum CPol {
419420

420421
} // namespace CPol
421422

423+
/// The atomic synchronization scopes supported by the AMDGPU target.
424+
enum class SIAtomicScope {
425+
NONE,
426+
SINGLETHREAD,
427+
WAVEFRONT,
428+
WORKGROUP,
429+
AGENT,
430+
SYSTEM
431+
};
432+
433+
/// The distinct address spaces supported by the AMDGPU target for
434+
/// atomic memory operations. Can be ORed together.
435+
enum class SIAtomicAddrSpace {
436+
NONE = 0u,
437+
GLOBAL = 1u << 0,
438+
LDS = 1u << 1,
439+
SCRATCH = 1u << 2,
440+
GDS = 1u << 3,
441+
OTHER = 1u << 4,
442+
443+
/// The address spaces that can be accessed by a FLAT instruction.
444+
FLAT = GLOBAL | LDS | SCRATCH,
445+
446+
/// The address spaces that support atomic instructions.
447+
ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
448+
449+
/// All address spaces.
450+
ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
451+
452+
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
453+
};
454+
422455
namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
423456

424457
enum Id { // Message ID, width(4) [3:0].

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@
3737
#include "llvm/CodeGen/MachinePostDominators.h"
3838
#include "llvm/Support/DebugCounter.h"
3939
#include "llvm/TargetParser/TargetParser.h"
40+
4041
using namespace llvm;
42+
using namespace AMDGPU;
4143

4244
#define DEBUG_TYPE "si-insert-waitcnts"
4345

@@ -1381,6 +1383,32 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
13811383
Modified = true;
13821384
} else
13831385
WaitcntInstr = &II;
1386+
} else if (Opcode == AMDGPU::S_WAITCNT_FENCE_soft) {
1387+
// Each direct load to LDS is also a store to LDS, but we do not have a
1388+
// separate counter for it. Instead these operations increment LOAD_CNT
1389+
// and need to be waited for at a release fence. So we treat a release
1390+
// fence as if it depends on any previous LDS DMA stores.
1391+
unsigned Ordering =
1392+
TII->getNamedOperand(II, AMDGPU::OpName::Ordering)->getImm();
1393+
unsigned Scope =
1394+
TII->getNamedOperand(II, AMDGPU::OpName::Scope)->getImm();
1395+
unsigned AddrSpace =
1396+
TII->getNamedOperand(II, AMDGPU::OpName::AddrSpace)->getImm();
1397+
if (isReleaseOrStronger((AtomicOrdering)Ordering) &&
1398+
Scope >= (unsigned)AMDGPU::SIAtomicScope::WORKGROUP &&
1399+
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
1400+
LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_FENCE_soft: " << II
1401+
<< "Before: " << Wait.LoadCnt << '\n';);
1402+
ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
1403+
LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
1404+
}
1405+
// It is possible (but unlikely) that this is the only wait instruction,
1406+
// in which case, we exit this loop without a WaitcntInstr to consume
1407+
// `Wait`. But that works because `Wait` was passed in by reference, and
1408+
// the caller eventually calls createNewWaitcnt on it. We test this
1409+
// possibility in an artificial MIR test since such a situation cannot be
1410+
// recreated by running the memory legalizer.
1411+
II.eraseFromParent();
13841412
} else {
13851413
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
13861414
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1552,6 +1580,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
15521580
ScoreBrackets.simplifyWaitcnt(OldWait);
15531581
Wait = Wait.combined(OldWait);
15541582
UpdatableInstr = &CombinedStoreDsCntInstr;
1583+
} else if (Opcode == AMDGPU::S_WAITCNT_FENCE_soft) {
1584+
// Architectures higher than GFX10 do not have direct loads to
1585+
// LDS, so no work required here yet.
1586+
II.eraseFromParent();
1587+
continue;
15551588
} else {
15561589
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
15571590
assert(CT.has_value());
@@ -2444,6 +2477,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
24442477
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
24452478
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
24462479
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2480+
Opcode == AMDGPU::S_WAITCNT_FENCE_soft ||
24472481
counterTypeForInstr(Opcode).has_value();
24482482
}
24492483

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 39 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -57,38 +57,6 @@ enum class Position {
5757
AFTER
5858
};
5959

60-
/// The atomic synchronization scopes supported by the AMDGPU target.
61-
enum class SIAtomicScope {
62-
NONE,
63-
SINGLETHREAD,
64-
WAVEFRONT,
65-
WORKGROUP,
66-
AGENT,
67-
SYSTEM
68-
};
69-
70-
/// The distinct address spaces supported by the AMDGPU target for
71-
/// atomic memory operation. Can be ORed together.
72-
enum class SIAtomicAddrSpace {
73-
NONE = 0u,
74-
GLOBAL = 1u << 0,
75-
LDS = 1u << 1,
76-
SCRATCH = 1u << 2,
77-
GDS = 1u << 3,
78-
OTHER = 1u << 4,
79-
80-
/// The address spaces that can be accessed by a FLAT instruction.
81-
FLAT = GLOBAL | LDS | SCRATCH,
82-
83-
/// The address spaces that support atomic instructions.
84-
ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
85-
86-
/// All address spaces.
87-
ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
88-
89-
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
90-
};
91-
9260
class SIMemOpInfo final {
9361
private:
9462

@@ -1160,6 +1128,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
11601128
Changed = true;
11611129
}
11621130

1131+
// Emit a soft wait count as a placeholder for SIInsertWaitcnts, which will
1132+
// later add additional waits. To minimize clutter, we do this only when
1133+
// required. For now this just means a release operation at workgroup scope
1134+
// that synchronizes LDS, required by direct loads to LDS.
1135+
if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP &&
1136+
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
1137+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft))
1138+
.addImm((unsigned)Order)
1139+
.addImm((unsigned)Scope)
1140+
.addImm((unsigned)AddrSpace);
1141+
Changed = true;
1142+
}
1143+
11631144
if (Pos == Position::AFTER)
11641145
--MI;
11651146

@@ -2068,6 +2049,19 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
20682049
Changed = true;
20692050
}
20702051

2052+
// Emit a soft wait count as a placeholder for SIInsertWaitcnts, which will
2053+
// later add additional waits. To minimize clutter, we do this only when
2054+
// required. For now this just means a release operation at workgroup scope
2055+
// that synchronizes LDS, required by direct loads to LDS.
2056+
if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP &&
2057+
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
2058+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft))
2059+
.addImm((unsigned)Order)
2060+
.addImm((unsigned)Scope)
2061+
.addImm((unsigned)AddrSpace);
2062+
Changed = true;
2063+
}
2064+
20712065
if (VSCnt) {
20722066
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
20732067
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
@@ -2385,6 +2379,19 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23852379
Changed = true;
23862380
}
23872381

2382+
// Emit a soft wait count as a placeholder for SIInsertWaitcnts, which will
2383+
// later add additional waits. To minimize clutter, we do this only when
2384+
// required. For now this just means a release operation at workgroup scope
2385+
// that synchronizes LDS, required by direct loads to LDS.
2386+
if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP &&
2387+
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
2388+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft))
2389+
.addImm((unsigned)Order)
2390+
.addImm((unsigned)Scope)
2391+
.addImm((unsigned)AddrSpace);
2392+
Changed = true;
2393+
}
2394+
23882395
if (Pos == Position::AFTER)
23892396
--MI;
23902397

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1621,6 +1621,12 @@ let OtherPredicates = [HasImageInsts] in {
16211621
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
16221622
}
16231623

1624+
def S_WAITCNT_FENCE_soft : SPseudoInstSI <
1625+
(outs), (ins i32imm:$Ordering, i32imm:$Scope, i32imm:$AddrSpace)> {
1626+
let hasSideEffects = 0;
1627+
let UseNamedOperandTable = 1;
1628+
}
1629+
16241630
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
16251631
[(int_amdgcn_s_sethalt timm:$simm16)]>;
16261632
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;

0 commit comments

Comments
 (0)