Skip to content

Commit de1b333

Browse files
authored
Merge cherry picked gfx950 mfma hazard patches (llvm#1015)
2 parents ef8556d + c4a867f commit de1b333

File tree

7 files changed

+421
-394
lines changed

7 files changed

+421
-394
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 20 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -2585,20 +2585,24 @@ static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
25852585
return NumPasses + 2;
25862586
}
25872587

2588-
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2589-
// 2 pass -> 5
2590-
// 4 pass -> 7
2591-
// 8 pass -> 11
2592-
// 16 pass -> 19
2593-
return NumPasses + 3;
2588+
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
2589+
bool IsGFX950) {
2590+
// xdl def cycles | gfx940 | gfx950
2591+
// 2 pass | 5 5
2592+
// 4 pass | 7 8
2593+
// 8 pass | 11 12
2594+
// 16 pass | 19 20
2595+
return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
25942596
}
25952597

2596-
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2597-
// 2 pass -> 5
2598-
// 4 pass -> 7
2599-
// 8 pass -> 11
2600-
// 16 pass -> 19
2601-
return NumPasses + 3;
2598+
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
2599+
bool IsGFX950) {
2600+
// xdl def cycles | gfx940 | gfx950
2601+
// 2 pass | 5 5
2602+
// 4 pass | 7 8
2603+
// 8 pass | 11 12
2604+
// 16 pass | 19 20
2605+
return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
26022606
}
26032607

26042608
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
@@ -2749,7 +2753,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
27492753
} else if (ST.hasGFX940Insts()) {
27502754
NeedWaitStates =
27512755
isXDL(ST, *MFMA)
2752-
? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2756+
? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
2757+
NumPasses, ST.hasGFX950Insts())
27532758
: GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
27542759
NumPasses);
27552760
} else {
@@ -2835,7 +2840,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
28352840
} else if (ST.hasGFX940Insts()) {
28362841
NeedWaitStates =
28372842
isXDL(ST, *MFMA)
2838-
? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2843+
? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
2844+
NumPasses, ST.hasGFX950Insts())
28392845
: GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
28402846
} else {
28412847
switch (NumPasses) {

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
4949
; GCN-NEXT: v_mov_b32_e32 v9, s13
5050
; GCN-NEXT: v_mov_b32_e32 v10, s14
5151
; GCN-NEXT: v_mov_b32_e32 v11, s15
52-
; GCN-NEXT: s_nop 3
52+
; GCN-NEXT: s_nop 4
5353
; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
5454
; GCN-NEXT: s_waitcnt vmcnt(0)
5555
; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -122,7 +122,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
122122
; GCN-NEXT: v_mov_b32_e32 v9, s13
123123
; GCN-NEXT: v_mov_b32_e32 v10, s14
124124
; GCN-NEXT: v_mov_b32_e32 v11, s15
125-
; GCN-NEXT: s_nop 3
125+
; GCN-NEXT: s_nop 4
126126
; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
127127
; GCN-NEXT: s_waitcnt vmcnt(0)
128128
; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -179,7 +179,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
179179
; GCN-NEXT: s_nop 1
180180
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
181181
; GCN-NEXT: s_nop 7
182-
; GCN-NEXT: s_nop 2
182+
; GCN-NEXT: s_nop 3
183183
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
184184
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
185185
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -224,7 +224,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
224224
; GCN-NEXT: s_nop 1
225225
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
226226
; GCN-NEXT: s_nop 7
227-
; GCN-NEXT: s_nop 2
227+
; GCN-NEXT: s_nop 3
228228
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
229229
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
230230
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -396,7 +396,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
396396
; GCN-NEXT: v_mov_b32_e32 v16, 0
397397
; GCN-NEXT: s_waitcnt lgkmcnt(0)
398398
; GCN-NEXT: s_nop 7
399-
; GCN-NEXT: s_nop 0
399+
; GCN-NEXT: s_nop 1
400400
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
401401
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
402402
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -431,7 +431,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
431431
; GCN-NEXT: v_mov_b32_e32 v16, 0
432432
; GCN-NEXT: s_waitcnt lgkmcnt(0)
433433
; GCN-NEXT: s_nop 7
434-
; GCN-NEXT: s_nop 0
434+
; GCN-NEXT: s_nop 1
435435
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
436436
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
437437
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16

0 commit comments

Comments
 (0)