Skip to content

Commit 560e7df

Browse files
authored
AMDGPU: Handle the co-execution hazards for TRANS for gfx1250 (#149024)
For co-execution of TRANS ops on gfx1250, the requirement is as follows: because a TRANS op takes 2 cycles, at least one independent op (or a V_NOP) must be issued after the TRANS op before either its source registers can be overwritten or its output can be used.
1 parent ba271cc commit 560e7df

File tree

4 files changed

+181
-0
lines changed

4 files changed

+181
-0
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,6 +1189,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
11891189
}
11901190
fixVALUPartialForwardingHazard(MI);
11911191
fixVALUTransUseHazard(MI);
1192+
fixVALUTransCoexecutionHazards(MI);
11921193
fixWMMAHazards(MI);
11931194
fixShift64HighRegBug(MI);
11941195
fixVALUMaskWriteHazard(MI);
@@ -1809,6 +1810,51 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
18091810
return true;
18101811
}
18111812

1813+
// Inserts a V_NOP before \p MI when \p MI is a non-TRANS VALU instruction that
// has a register dependency (RAW or WAR) on an earlier TRANS instruction with
// no VALU instruction in between. Only applies on GFX1250.
//
// \param MI the instruction being checked; the V_NOP, if any, is built
//           immediately before it in its parent block.
// \returns true if a V_NOP was inserted, false otherwise.
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1814+
// Bail out unless the target is GFX1250 and MI is a VALU that is not itself a
// TRANS op (TRANS-after-TRANS is not handled by this fix).
if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled.
1815+
!SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
1816+
return false;
1817+
1818+
const SIInstrInfo *TII = ST.getInstrInfo();
1819+
const SIRegisterInfo *TRI = ST.getRegisterInfo();
1820+
1821+
// Hazard predicate: true when the candidate instruction I is a TRANS op whose
// registers conflict with MI in either direction.
auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1822+
if (!SIInstrInfo::isTRANS(I))
1823+
return false;
1824+
1825+
// RAW: Trans(I) writes, VALU(MI) reads.
1826+
// NOTE(review): the vdst operand is dereferenced without a null check; this
// assumes every TRANS instruction has a vdst operand -- confirm.
Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1827+
for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1828+
if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1829+
return true;
1830+
}
1831+
1832+
// Without a register vdst on MI there is nothing MI could overwrite, so the
// WAR check below is skipped.
auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1833+
if (!ValuDst || !ValuDst->isReg())
1834+
return false;
1835+
1836+
// WAR: Trans(I) reads, VALU(MI) writes.
1837+
Register ValuDef = ValuDst->getReg();
1838+
for (const MachineOperand &TransUse : I.explicit_uses()) {
1839+
if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1840+
return true;
1841+
}
1842+
1843+
return false;
1844+
};
1845+
1846+
// Expiry predicate: any VALU instruction ends the search; the wait-state
// count argument is ignored.
auto IsExpiredFn = [](const MachineInstr &I, int) {
1847+
return SIInstrInfo::isVALU(I);
1848+
};
1849+
1850+
// INT_MAX is the sentinel compared against below.
// NOTE(review): presumably getWaitStatesSince returns this value when the
// search expires (here: a VALU was seen first) or finds no hazard -- confirm
// against its definition, which is not visible in this hunk.
const int HasVALU = std::numeric_limits<int>::max();
1851+
if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
1852+
return false;
1853+
1854+
// A conflicting TRANS op was found with no VALU separating it from MI:
// place a V_NOP in front of MI.
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1855+
return true;
1856+
}
1857+
18121858
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
18131859
if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
18141860
return false;

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
104104
bool fixLdsDirectVMEMHazard(MachineInstr *MI);
105105
bool fixVALUPartialForwardingHazard(MachineInstr *MI);
106106
bool fixVALUTransUseHazard(MachineInstr *MI);
107+
bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
107108
bool fixWMMAHazards(MachineInstr *MI);
108109
bool fixShift64HighRegBug(MachineInstr *MI);
109110
bool fixVALUMaskWriteHazard(MachineInstr *MI);

llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a)
6666
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
6767
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
6868
; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v0.l
69+
; GFX12-TRUE16-NEXT: v_nop
6970
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
7071
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
7172
; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
@@ -90,6 +91,7 @@ define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a)
9091
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
9192
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
9293
; GFX12-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0
94+
; GFX12-FAKE16-NEXT: v_nop
9395
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9496
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
9597
; GFX12-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX1250 %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX1200 %s
4+
5+
---
6+
name: trans_writes_valu_reads_hazard
7+
body: |
8+
bb.0:
9+
; GFX1250-LABEL: name: trans_writes_valu_reads_hazard
10+
; GFX1250: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
11+
; GFX1250-NEXT: V_NOP_e32 implicit $exec
12+
; GFX1250-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
13+
;
14+
; GFX1200-LABEL: name: trans_writes_valu_reads_hazard
15+
; GFX1200: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
16+
; GFX1200-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
17+
$vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
18+
$vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
19+
...
20+
21+
---
22+
name: trans_writes_valu_valu_reads_hazard_covered
23+
body: |
24+
bb.0:
25+
; GCN-LABEL: name: trans_writes_valu_valu_reads_hazard_covered
26+
; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
27+
; GCN-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
28+
; GCN-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
29+
$vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
30+
$vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
31+
$vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
32+
...
33+
34+
---
35+
name: trans_writes_salu_valu_reads_hazard
36+
body: |
37+
bb.0:
38+
; GFX1250-LABEL: name: trans_writes_salu_valu_reads_hazard
39+
; GFX1250: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
40+
; GFX1250-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
41+
; GFX1250-NEXT: V_NOP_e32 implicit $exec
42+
; GFX1250-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
43+
;
44+
; GFX1200-LABEL: name: trans_writes_salu_valu_reads_hazard
45+
; GFX1200: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
46+
; GFX1200-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
47+
; GFX1200-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
48+
$vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
49+
$sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
50+
$vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
51+
...
52+
53+
---
54+
name: trans_no_hazard
55+
body: |
56+
bb.0:
57+
; GCN-LABEL: name: trans_no_hazard
58+
; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
59+
; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr2, implicit $mode, implicit $exec
60+
$vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
61+
$vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr2, implicit $mode, implicit $exec
62+
...
63+
64+
---
65+
name: trans_reads_valu_writes_hazard
66+
body: |
67+
bb.0:
68+
; GFX1250-LABEL: name: trans_reads_valu_writes_hazard
69+
; GFX1250: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
70+
; GFX1250-NEXT: V_NOP_e32 implicit $exec
71+
; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
72+
;
73+
; GFX1200-LABEL: name: trans_reads_valu_writes_hazard
74+
; GFX1200: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
75+
; GFX1200-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
76+
$vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
77+
$vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
78+
...
79+
80+
---
81+
name: trans_reads_valu_valu_writes_hazard_covered
82+
body: |
83+
bb.0:
84+
; GCN-LABEL: name: trans_reads_valu_valu_writes_hazard_covered
85+
; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
86+
; GCN-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
87+
; GCN-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
88+
$vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
89+
$vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
90+
$vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
91+
...
92+
93+
---
94+
name: trans_reads__salu_valu_writes_hazard
95+
body: |
96+
bb.0:
97+
; GFX1250-LABEL: name: trans_reads__salu_valu_writes_hazard
98+
; GFX1250: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
99+
; GFX1250-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
100+
; GFX1250-NEXT: V_NOP_e32 implicit $exec
101+
; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
102+
;
103+
; GFX1200-LABEL: name: trans_reads__salu_valu_writes_hazard
104+
; GFX1200: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
105+
; GFX1200-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
106+
; GFX1200-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
107+
$vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
108+
$sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
109+
$vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
110+
...
111+
112+
---
113+
name: trans_writes_trans_reads_no_hazard
114+
body: |
115+
bb.0:
116+
; GCN-LABEL: name: trans_writes_trans_reads_no_hazard
117+
; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
118+
; GCN-NEXT: $vgpr2 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
119+
$vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
120+
$vgpr2 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
121+
...
122+
123+
---
124+
name: trans_reads_trans_writes_no_hazard
125+
body: |
126+
bb.0:
127+
; GCN-LABEL: name: trans_reads_trans_writes_no_hazard
128+
; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
129+
; GCN-NEXT: $vgpr0 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
130+
$vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
131+
$vgpr0 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
132+
...

0 commit comments

Comments
 (0)