Skip to content

Commit d748c81

Browse files
authored
[AMDGPU] Change the immediate operand of s_waitcnt_depctr / s_wait_alu (#169378)
The 16-bit immediate operand of s_waitcnt_depctr / s_wait_alu has some unused bits. Previously codegen would set these bits to 1, but setting them to 0 matches the SP3 assembler behaviour better, which in turn means that we can print them using the human-readable SP3 syntax:

    s_wait_alu 0xfffd            ; unused bits set to 1
    s_wait_alu 0xff9d            ; unused bits set to 0
    s_wait_alu depctr_va_vcc(0)  ; unused bits set to 0, human readable

Note that the set of unused bits changed between GFX10.1 and GFX10.3.
1 parent 105900c commit d748c81

File tree

220 files changed

+9546
-9537
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

220 files changed

+9546
-9537
lines changed

llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ namespace {
4444

4545
class AMDGPUWaitSGPRHazards {
4646
public:
47+
const GCNSubtarget *ST;
4748
const SIInstrInfo *TII;
4849
const SIRegisterInfo *TRI;
4950
const MachineRegisterInfo *MRI;
@@ -165,7 +166,7 @@ class AMDGPUWaitSGPRHazards {
165166
}
166167

167168
unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
168-
unsigned Mask = 0xffff;
169+
unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST);
169170
Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
170171
Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
171172
AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
@@ -387,7 +388,7 @@ class AMDGPUWaitSGPRHazards {
387388

388389
// Apply wait
389390
if (Wait) {
390-
unsigned Mask = 0xffff;
391+
unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST);
391392
if (Wait & WA_VCC) {
392393
State.VCCHazard &= ~HazardState::VALU;
393394
Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
@@ -438,8 +439,8 @@ class AMDGPUWaitSGPRHazards {
438439
}
439440

440441
bool run(MachineFunction &MF) {
441-
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
442-
if (!ST.hasVALUReadSGPRHazard())
442+
ST = &MF.getSubtarget<GCNSubtarget>();
443+
if (!ST->hasVALUReadSGPRHazard())
443444
return false;
444445

445446
// Parse settings
@@ -467,10 +468,10 @@ class AMDGPUWaitSGPRHazards {
467468
if (!EnableSGPRHazardWaits)
468469
return false;
469470

470-
TII = ST.getInstrInfo();
471-
TRI = ST.getRegisterInfo();
471+
TII = ST->getInstrInfo();
472+
TRI = ST->getRegisterInfo();
472473
MRI = &MF.getRegInfo();
473-
DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
474+
DsNopCount = ST->isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
474475

475476
auto CallingConv = MF.getFunction().getCallingConv();
476477
if (!AMDGPU::isEntryFunctionCC(CallingConv) &&

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1354,7 +1354,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
13541354
const SIInstrInfo *TII = ST.getInstrInfo();
13551355
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
13561356
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1357-
.addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1357+
.addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
13581358
return true;
13591359
}
13601360

@@ -1487,7 +1487,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
14871487

14881488
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
14891489
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1490-
.addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1490+
.addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST));
14911491
return true;
14921492
}
14931493

@@ -1651,7 +1651,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
16511651
} else {
16521652
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
16531653
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1654-
.addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1654+
.addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
16551655
}
16561656

16571657
return true;
@@ -1809,7 +1809,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
18091809

18101810
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
18111811
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1812-
.addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1812+
.addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST));
18131813

18141814
return true;
18151815
}
@@ -1895,7 +1895,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
18951895
// avoided.
18961896
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
18971897
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1898-
.addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1898+
.addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST));
18991899

19001900
return true;
19011901
}
@@ -3404,7 +3404,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
34043404
};
34053405

34063406
const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3407-
AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0),
3407+
AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0, ST),
3408+
0),
34083409
0);
34093410
auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
34103411
switch (I.getOpcode()) {
@@ -3456,9 +3457,9 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
34563457

34573458
// Compute counter mask
34583459
unsigned DepCtr =
3459-
IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
3460-
: AMDGPU::DepCtr::encodeFieldVaSdst(0))
3461-
: AMDGPU::DepCtr::encodeFieldSaSdst(0);
3460+
IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
3461+
: AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
3462+
: AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
34623463

34633464
// Try to merge previous waits into this one for regions with no SGPR reads.
34643465
if (!WaitInstrs.empty()) {
@@ -3723,7 +3724,7 @@ bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
37233724
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
37243725
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
37253726
.addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
3726-
AMDGPU::DepCtr::encodeFieldSaSdst(0), 0));
3727+
AMDGPU::DepCtr::encodeFieldSaSdst(0, ST), 0));
37273728
return true;
37283729
}
37293730

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2934,7 +2934,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
29342934
auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
29352935
if (FlushSGPRWrites)
29362936
BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2937-
.addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2937+
.addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST));
29382938
};
29392939

29402940
// We need to compute the offset relative to the instruction immediately after

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2052,56 +2052,63 @@ unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) {
20522052
return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth());
20532053
}
20542054

2055-
unsigned encodeFieldVmVsrc(unsigned VmVsrc) {
2056-
return encodeFieldVmVsrc(0xffff, VmVsrc);
2055+
unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI) {
2056+
unsigned Encoded = getDefaultDepCtrEncoding(STI);
2057+
return encodeFieldVmVsrc(Encoded, VmVsrc);
20572058
}
20582059

20592060
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst) {
20602061
return packBits(VaVdst, Encoded, getVaVdstBitShift(), getVaVdstBitWidth());
20612062
}
20622063

2063-
unsigned encodeFieldVaVdst(unsigned VaVdst) {
2064-
return encodeFieldVaVdst(0xffff, VaVdst);
2064+
unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI) {
2065+
unsigned Encoded = getDefaultDepCtrEncoding(STI);
2066+
return encodeFieldVaVdst(Encoded, VaVdst);
20652067
}
20662068

20672069
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst) {
20682070
return packBits(SaSdst, Encoded, getSaSdstBitShift(), getSaSdstBitWidth());
20692071
}
20702072

2071-
unsigned encodeFieldSaSdst(unsigned SaSdst) {
2072-
return encodeFieldSaSdst(0xffff, SaSdst);
2073+
unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI) {
2074+
unsigned Encoded = getDefaultDepCtrEncoding(STI);
2075+
return encodeFieldSaSdst(Encoded, SaSdst);
20732076
}
20742077

20752078
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst) {
20762079
return packBits(VaSdst, Encoded, getVaSdstBitShift(), getVaSdstBitWidth());
20772080
}
20782081

2079-
unsigned encodeFieldVaSdst(unsigned VaSdst) {
2080-
return encodeFieldVaSdst(0xffff, VaSdst);
2082+
unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI) {
2083+
unsigned Encoded = getDefaultDepCtrEncoding(STI);
2084+
return encodeFieldVaSdst(Encoded, VaSdst);
20812085
}
20822086

20832087
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc) {
20842088
return packBits(VaVcc, Encoded, getVaVccBitShift(), getVaVccBitWidth());
20852089
}
20862090

2087-
unsigned encodeFieldVaVcc(unsigned VaVcc) {
2088-
return encodeFieldVaVcc(0xffff, VaVcc);
2091+
unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI) {
2092+
unsigned Encoded = getDefaultDepCtrEncoding(STI);
2093+
return encodeFieldVaVcc(Encoded, VaVcc);
20892094
}
20902095

20912096
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc) {
20922097
return packBits(VaSsrc, Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth());
20932098
}
20942099

2095-
unsigned encodeFieldVaSsrc(unsigned VaSsrc) {
2096-
return encodeFieldVaSsrc(0xffff, VaSsrc);
2100+
unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI) {
2101+
unsigned Encoded = getDefaultDepCtrEncoding(STI);
2102+
return encodeFieldVaSsrc(Encoded, VaSsrc);
20972103
}
20982104

20992105
unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt) {
21002106
return packBits(HoldCnt, Encoded, getHoldCntBitShift(), getHoldCntWidth());
21012107
}
21022108

2103-
unsigned encodeFieldHoldCnt(unsigned HoldCnt) {
2104-
return encodeFieldHoldCnt(0xffff, HoldCnt);
2109+
unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI) {
2110+
unsigned Encoded = getDefaultDepCtrEncoding(STI);
2111+
return encodeFieldHoldCnt(Encoded, HoldCnt);
21052112
}
21062113

21072114
} // namespace DepCtr

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,43 +1301,43 @@ unsigned decodeFieldVaSsrc(unsigned Encoded);
13011301
unsigned decodeFieldHoldCnt(unsigned Encoded);
13021302

13031303
/// \returns \p VmVsrc as an encoded Depctr immediate.
1304-
unsigned encodeFieldVmVsrc(unsigned VmVsrc);
1304+
unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI);
13051305

13061306
/// \returns \p Encoded combined with encoded \p VmVsrc.
13071307
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc);
13081308

13091309
/// \returns \p VaVdst as an encoded Depctr immediate.
1310-
unsigned encodeFieldVaVdst(unsigned VaVdst);
1310+
unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI);
13111311

13121312
/// \returns \p Encoded combined with encoded \p VaVdst.
13131313
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst);
13141314

13151315
/// \returns \p SaSdst as an encoded Depctr immediate.
1316-
unsigned encodeFieldSaSdst(unsigned SaSdst);
1316+
unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI);
13171317

13181318
/// \returns \p Encoded combined with encoded \p SaSdst.
13191319
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst);
13201320

13211321
/// \returns \p VaSdst as an encoded Depctr immediate.
1322-
unsigned encodeFieldVaSdst(unsigned VaSdst);
1322+
unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI);
13231323

13241324
/// \returns \p Encoded combined with encoded \p VaSdst.
13251325
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst);
13261326

13271327
/// \returns \p VaVcc as an encoded Depctr immediate.
1328-
unsigned encodeFieldVaVcc(unsigned VaVcc);
1328+
unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI);
13291329

13301330
/// \returns \p Encoded combined with encoded \p VaVcc.
13311331
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc);
13321332

13331333
/// \returns \p HoldCnt as an encoded Depctr immediate.
1334-
unsigned encodeFieldHoldCnt(unsigned HoldCnt);
1334+
unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI);
13351335

13361336
/// \returns \p Encoded combined with encoded \p HoldCnt.
1337-
unsigned encodeFieldHoldCnt(unsigned HoldCnt, unsigned Encoded);
1337+
unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt);
13381338

13391339
/// \returns \p VaSsrc as an encoded Depctr immediate.
1340-
unsigned encodeFieldVaSsrc(unsigned VaSsrc);
1340+
unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI);
13411341

13421342
/// \returns \p Encoded combined with encoded \p VaSsrc.
13431343
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc);

llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ define i16 @s_add_i16(i16 inreg %a, i16 inreg %b) {
5050
; GFX12-NEXT: s_wait_bvhcnt 0x0
5151
; GFX12-NEXT: s_wait_kmcnt 0x0
5252
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
53-
; GFX12-NEXT: s_wait_alu 0xfffe
53+
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
5454
; GFX12-NEXT: v_mov_b32_e32 v0, s0
5555
; GFX12-NEXT: s_setpc_b64 s[30:31]
5656
%c = add i16 %a, %b
@@ -145,7 +145,7 @@ define i32 @s_add_i32(i32 inreg %a, i32 inreg %b) {
145145
; GFX12-NEXT: s_wait_bvhcnt 0x0
146146
; GFX12-NEXT: s_wait_kmcnt 0x0
147147
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
148-
; GFX12-NEXT: s_wait_alu 0xfffe
148+
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
149149
; GFX12-NEXT: v_mov_b32_e32 v0, s0
150150
; GFX12-NEXT: s_setpc_b64 s[30:31]
151151
%c = add i32 %a, %b
@@ -263,11 +263,11 @@ define <2 x i16> @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
263263
; GFX12-NEXT: s_lshr_b32 s2, s0, 16
264264
; GFX12-NEXT: s_lshr_b32 s3, s1, 16
265265
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
266-
; GFX12-NEXT: s_wait_alu 0xfffe
266+
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
267267
; GFX12-NEXT: s_add_co_i32 s2, s2, s3
268-
; GFX12-NEXT: s_wait_alu 0xfffe
268+
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
269269
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s2
270-
; GFX12-NEXT: s_wait_alu 0xfffe
270+
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
271271
; GFX12-NEXT: v_mov_b32_e32 v0, s0
272272
; GFX12-NEXT: s_setpc_b64 s[30:31]
273273
%c = add <2 x i16> %a, %b
@@ -374,7 +374,7 @@ define i64 @s_add_i64(i64 inreg %a, i64 inreg %b) {
374374
; GFX12-NEXT: s_wait_bvhcnt 0x0
375375
; GFX12-NEXT: s_wait_kmcnt 0x0
376376
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
377-
; GFX12-NEXT: s_wait_alu 0xfffe
377+
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
378378
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
379379
; GFX12-NEXT: s_setpc_b64 s[30:31]
380380
%c = add i64 %a, %b
@@ -425,7 +425,7 @@ define i64 @v_add_i64(i64 %a, i64 %b) {
425425
; GFX12-NEXT: s_wait_bvhcnt 0x0
426426
; GFX12-NEXT: s_wait_kmcnt 0x0
427427
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
428-
; GFX12-NEXT: s_wait_alu 0xfffd
428+
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
429429
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
430430
; GFX12-NEXT: s_setpc_b64 s[30:31]
431431
%c = add i64 %a, %b
@@ -513,7 +513,7 @@ define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt
513513
; GFX12-NEXT: s_add_co_u32 s0, s0, s2
514514
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s3
515515
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
516-
; GFX12-NEXT: s_wait_alu 0xfffe
516+
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
517517
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
518518
; GFX12-NEXT: v_mov_b32_e32 v6, s2
519519
; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off
@@ -593,9 +593,9 @@ define void @v_uaddo_uadde(i64 %a, i64 %b, ptr addrspace(1) %res, ptr addrspace(
593593
; GFX12-NEXT: s_wait_bvhcnt 0x0
594594
; GFX12-NEXT: s_wait_kmcnt 0x0
595595
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
596-
; GFX12-NEXT: s_wait_alu 0xfffd
596+
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
597597
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
598-
; GFX12-NEXT: s_wait_alu 0xfffd
598+
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
599599
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
600600
; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off
601601
; GFX12-NEXT: global_store_b32 v[6:7], v2, off

0 commit comments

Comments
 (0)