Skip to content

Commit eeced0d

Browse files
authored
[AMDGPU] Use larger immediate values in S_NOP (llvm#158990)
The S_NOP instruction takes an immediate operand that encodes one less than the number of cycles to delay. The maximum value that can be encoded in this field was increased in GFX8 (from 3 to 4 significant bits) and again in GFX12 (to 7 significant bits).
1 parent a42aac5 commit eeced0d

31 files changed

+2584
-3452
lines changed

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1839,6 +1839,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18391839
/// \returns true if the subtarget requires a wait for xcnt before atomic
18401840
/// flat/global stores & rmw.
18411841
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
1842+
1843+
/// \returns the number of significant bits in the immediate field of the
1844+
/// S_NOP instruction.
1845+
unsigned getSNopBits() const { // checks run from the newest generation down; first match wins
1846+
if (getGeneration() >= AMDGPUSubtarget::GFX12)
1847+
return 7; // generation >= GFX12: widest field
1848+
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1849+
return 4; // generation >= VOLCANIC_ISLANDS but below GFX12
1850+
return 3; // every generation that compares below VOLCANIC_ISLANDS
1851+
}
18421852
};
18431853

18441854
class GCNUserSGPRUsageInfo {

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1932,8 +1932,9 @@ void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
19321932
MachineBasicBlock::iterator MI,
19331933
unsigned Quantity) const {
19341934
DebugLoc DL = MBB.findDebugLoc(MI);
1935+
unsigned MaxSNopCount = 1u << ST.getSNopBits();
19351936
while (Quantity > 0) {
1936-
unsigned Arg = std::min(Quantity, 8u);
1937+
unsigned Arg = std::min(Quantity, MaxSNopCount);
19371938
Quantity -= Arg;
19381939
BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
19391940
}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
5858
; GCN-NEXT: s_nop 1
5959
; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
6060
; GCN-NEXT: v_mov_b32_e32 v0, 0
61-
; GCN-NEXT: s_nop 7
62-
; GCN-NEXT: s_nop 7
61+
; GCN-NEXT: s_nop 15
6362
; GCN-NEXT: s_nop 1
6463
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
6564
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
@@ -109,8 +108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
109108
; GCN-NEXT: s_nop 1
110109
; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
111110
; GCN-NEXT: v_mov_b32_e32 v0, 0
112-
; GCN-NEXT: s_nop 7
113-
; GCN-NEXT: s_nop 1
111+
; GCN-NEXT: s_nop 9
114112
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
115113
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
116114
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -185,8 +183,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
185183
; GCN-NEXT: s_nop 1
186184
; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
187185
; GCN-NEXT: v_mov_b32_e32 v0, 0
188-
; GCN-NEXT: s_nop 7
189-
; GCN-NEXT: s_nop 7
186+
; GCN-NEXT: s_nop 15
190187
; GCN-NEXT: s_nop 1
191188
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
192189
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -220,8 +217,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
220217
; GCN-NEXT: s_nop 1
221218
; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
222219
; GCN-NEXT: v_mov_b32_e32 v0, 0
223-
; GCN-NEXT: s_nop 7
224-
; GCN-NEXT: s_nop 1
220+
; GCN-NEXT: s_nop 9
225221
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
226222
; GCN-NEXT: s_endpgm
227223
bb:
@@ -277,8 +273,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
277273
; GCN-NEXT: s_nop 1
278274
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
279275
; GCN-NEXT: v_mov_b32_e32 v0, 0
280-
; GCN-NEXT: s_nop 7
281-
; GCN-NEXT: s_nop 7
276+
; GCN-NEXT: s_nop 15
282277
; GCN-NEXT: s_nop 0
283278
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
284279
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
@@ -302,8 +297,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
302297
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
303298
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
304299
; GCN-NEXT: v_mov_b32_e32 v0, 0
305-
; GCN-NEXT: s_nop 7
306-
; GCN-NEXT: s_nop 7
300+
; GCN-NEXT: s_nop 15
307301
; GCN-NEXT: s_nop 0
308302
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
309303
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -336,8 +330,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
336330
; GCN-NEXT: s_nop 1
337331
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
338332
; GCN-NEXT: v_mov_b32_e32 v0, 0
339-
; GCN-NEXT: s_nop 7
340-
; GCN-NEXT: s_nop 7
333+
; GCN-NEXT: s_nop 15
341334
; GCN-NEXT: s_nop 0
342335
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
343336
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -369,8 +362,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
369362
; GCN-NEXT: s_nop 1
370363
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
371364
; GCN-NEXT: v_mov_b32_e32 v0, 0
372-
; GCN-NEXT: s_nop 7
373-
; GCN-NEXT: s_nop 7
365+
; GCN-NEXT: s_nop 15
374366
; GCN-NEXT: s_nop 0
375367
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
376368
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16

llvm/test/CodeGen/AMDGPU/acc-ldst.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
99
; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
1010
; GCN-NOT: v_accvgpr_write
1111
; GCN: v_mfma_f32_32x32x1f32
12-
; GCN-NEXT: s_nop 7
13-
; GCN-NEXT: s_nop 7
12+
; GCN-NEXT: s_nop 15
1413
; GCN-NEXT: s_nop 2
1514
; GCN-NOT: v_accvgpr_read
1615
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
@@ -28,8 +27,7 @@ bb:
2827
; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
2928
; GCN-NOT: v_accvgpr_read
3029
; GCN: v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]:
31-
; GCN-NEXT: s_nop 7
32-
; GCN-NEXT: s_nop 7
30+
; GCN-NEXT: s_nop 15
3331
; GCN-NEXT: s_nop 2
3432
; GCN-NOT: v_accvgpr_read
3533
; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
@@ -80,8 +78,7 @@ bb:
8078
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
8179
; GCN-COUNT-32: v_accvgpr_write
8280
; GCN: v_mfma_f32_32x32x1f32
83-
; GCN-NEXT: s_nop 7
84-
; GCN-NEXT: s_nop 7
81+
; GCN-NEXT: s_nop 15
8582
; GCN-NEXT: s_nop 2
8683
; GCN-NOT: v_accvgpr_read
8784
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
6363
; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
6464
; GFX908-NEXT: s_nop 0
6565
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
66-
; GFX908-NEXT: s_nop 7
67-
; GFX908-NEXT: s_nop 1
66+
; GFX908-NEXT: s_nop 9
6867
; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse
6968
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
7069
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
@@ -181,8 +180,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
181180
; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
182181
; GFX90A-NEXT: s_nop 1
183182
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
184-
; GFX90A-NEXT: s_nop 7
185-
; GFX90A-NEXT: s_nop 2
183+
; GFX90A-NEXT: s_nop 10
186184
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
187185
; GFX90A-NEXT: s_nop 0
188186
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -487,8 +485,7 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
487485
; GFX90A-NEXT: ; copy
488486
; GFX90A-NEXT: ;;#ASMEND
489487
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v35 ; Reload Reuse
490-
; GFX90A-NEXT: s_nop 7
491-
; GFX90A-NEXT: s_nop 1
488+
; GFX90A-NEXT: s_nop 9
492489
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2
493490
; GFX90A-NEXT: ;;#ASMSTART
494491
; GFX90A-NEXT: ; use a3 v[0:31]
@@ -965,8 +962,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
965962
; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
966963
; GFX908-NEXT: s_nop 0
967964
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
968-
; GFX908-NEXT: s_nop 7
969-
; GFX908-NEXT: s_nop 1
965+
; GFX908-NEXT: s_nop 9
970966
; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse
971967
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
972968
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
@@ -1084,8 +1080,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
10841080
; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse
10851081
; GFX90A-NEXT: s_nop 0
10861082
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
1087-
; GFX90A-NEXT: s_nop 7
1088-
; GFX90A-NEXT: s_nop 2
1083+
; GFX90A-NEXT: s_nop 10
10891084
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
10901085
; GFX90A-NEXT: s_nop 0
10911086
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill

llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -63,52 +63,41 @@ body: |
6363
; GCN16-NEXT: successors: %bb.1(0x80000000)
6464
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
6565
; GCN16-NEXT: {{ $}}
66-
; GCN16-NEXT: S_NOP 7
67-
; GCN16-NEXT: S_NOP 7
66+
; GCN16-NEXT: S_NOP 15
6867
; GCN16-NEXT: S_BRANCH %bb.1
6968
; GCN16-NEXT: {{ $}}
7069
; GCN16-NEXT: bb.1:
7170
; GCN16-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
7271
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
7372
; GCN16-NEXT: {{ $}}
74-
; GCN16-NEXT: S_NOP 7
75-
; GCN16-NEXT: S_NOP 7
73+
; GCN16-NEXT: S_NOP 15
7674
; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
77-
; GCN16-NEXT: S_NOP 7
78-
; GCN16-NEXT: S_NOP 7
75+
; GCN16-NEXT: S_NOP 15
7976
; GCN16-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
8077
; GCN16-NEXT: {{ $}}
8178
; GCN16-NEXT: bb.2:
8279
; GCN16-NEXT: successors: %bb.3(0x80000000)
8380
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
8481
; GCN16-NEXT: {{ $}}
85-
; GCN16-NEXT: S_NOP 7
86-
; GCN16-NEXT: S_NOP 7
82+
; GCN16-NEXT: S_NOP 15
8783
; GCN16-NEXT: SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
88-
; GCN16-NEXT: S_NOP 7
89-
; GCN16-NEXT: S_NOP 7
84+
; GCN16-NEXT: S_NOP 15
9085
; GCN16-NEXT: S_NOP 0
91-
; GCN16-NEXT: S_NOP 7
92-
; GCN16-NEXT: S_NOP 7
86+
; GCN16-NEXT: S_NOP 15
9387
; GCN16-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
94-
; GCN16-NEXT: S_NOP 7
95-
; GCN16-NEXT: S_NOP 7
88+
; GCN16-NEXT: S_NOP 15
9689
; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
97-
; GCN16-NEXT: S_NOP 7
98-
; GCN16-NEXT: S_NOP 7
90+
; GCN16-NEXT: S_NOP 15
9991
; GCN16-NEXT: S_BRANCH %bb.3
10092
; GCN16-NEXT: {{ $}}
10193
; GCN16-NEXT: bb.3:
10294
; GCN16-NEXT: liveins: $sgpr10_sgpr11
10395
; GCN16-NEXT: {{ $}}
104-
; GCN16-NEXT: S_NOP 7
105-
; GCN16-NEXT: S_NOP 7
96+
; GCN16-NEXT: S_NOP 15
10697
; GCN16-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
107-
; GCN16-NEXT: S_NOP 7
108-
; GCN16-NEXT: S_NOP 7
98+
; GCN16-NEXT: S_NOP 15
10999
; GCN16-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
110-
; GCN16-NEXT: S_NOP 7
111-
; GCN16-NEXT: S_NOP 7
100+
; GCN16-NEXT: S_NOP 15
112101
; GCN16-NEXT: SI_RETURN
113102
bb.0:
114103
liveins: $sgpr6, $sgpr10_sgpr11

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
8787
; GFX908-NEXT: v_mov_b32_e32 v0, 2
8888
; GFX908-NEXT: s_nop 1
8989
; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
90-
; GFX908-NEXT: s_nop 7
91-
; GFX908-NEXT: s_nop 7
90+
; GFX908-NEXT: s_nop 15
9291
; GFX908-NEXT: s_nop 1
9392
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
9493
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
@@ -191,8 +190,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
191190
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
192191
; GFX90A-NEXT: s_nop 1
193192
; GFX90A-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
194-
; GFX90A-NEXT: s_nop 7
195-
; GFX90A-NEXT: s_nop 7
193+
; GFX90A-NEXT: s_nop 15
196194
; GFX90A-NEXT: s_nop 2
197195
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
198196
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@@ -256,8 +254,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
256254
; GFX908-NEXT: v_mov_b32_e32 v1, 2
257255
; GFX908-NEXT: s_nop 1
258256
; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
259-
; GFX908-NEXT: s_nop 7
260-
; GFX908-NEXT: s_nop 1
257+
; GFX908-NEXT: s_nop 9
261258
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
262259
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
263260
; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
@@ -308,8 +305,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
308305
; GFX90A-NEXT: s_nop 1
309306
; GFX90A-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
310307
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
311-
; GFX90A-NEXT: s_nop 7
312-
; GFX90A-NEXT: s_nop 1
308+
; GFX90A-NEXT: s_nop 9
313309
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
314310
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
315311
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -424,8 +420,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
424420
; GFX908-NEXT: v_mov_b32_e32 v1, 2
425421
; GFX908-NEXT: s_nop 1
426422
; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
427-
; GFX908-NEXT: s_nop 7
428-
; GFX908-NEXT: s_nop 7
423+
; GFX908-NEXT: s_nop 15
429424
; GFX908-NEXT: s_nop 1
430425
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
431426
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
@@ -476,8 +471,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
476471
; GFX90A-NEXT: s_nop 1
477472
; GFX90A-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
478473
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
479-
; GFX90A-NEXT: s_nop 7
480-
; GFX90A-NEXT: s_nop 7
474+
; GFX90A-NEXT: s_nop 15
481475
; GFX90A-NEXT: s_nop 1
482476
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
483477
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -513,8 +507,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
513507
; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
514508
; GFX908-NEXT: s_nop 0
515509
; GFX908-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
516-
; GFX908-NEXT: s_nop 7
517-
; GFX908-NEXT: s_nop 1
510+
; GFX908-NEXT: s_nop 9
518511
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
519512
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
520513
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
@@ -538,8 +531,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
538531
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
539532
; GFX90A-NEXT: s_nop 1
540533
; GFX90A-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
541-
; GFX90A-NEXT: s_nop 7
542-
; GFX90A-NEXT: s_nop 2
534+
; GFX90A-NEXT: s_nop 10
543535
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
544536
; GFX90A-NEXT: s_endpgm
545537
bb:

0 commit comments

Comments
 (0)