Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,14 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
return NSAThreshold;
}

/// \returns the number of significant bits in the immediate field of the
/// S_NOP instruction: 3 before VOLCANIC_ISLANDS, 4 from VOLCANIC_ISLANDS up
/// to (but not including) GFX12, and 7 from GFX12 onwards.
unsigned GCNSubtarget::getSNopBits() const {
  const auto Gen = getGeneration();
  // Oldest generations first: anything older than VOLCANIC_ISLANDS gets the
  // narrowest field.
  if (Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return 3;
  return Gen < AMDGPUSubtarget::GFX12 ? 4 : 7;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull definition into header?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
const GCNSubtarget &ST)
: ST(ST) {
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1839,6 +1839,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// \returns true when this subtarget needs a wait for xcnt to be issued
/// ahead of atomic flat/global stores and read-modify-write operations.
/// The answer is taken directly from the GFX1250Insts feature flag.
bool requiresWaitXCntBeforeAtomicStores() const {
  return GFX1250Insts;
}

/// \returns the number of significant bits in the immediate field of the
/// S_NOP instruction.
unsigned getSNopBits() const;
};

class GCNUserSGPRUsageInfo {
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1932,8 +1932,9 @@ void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned Quantity) const {
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned MaxSNopCount = 1u << ST.getSNopBits();
while (Quantity > 0) {
unsigned Arg = std::min(Quantity, 8u);
unsigned Arg = std::min(Quantity, MaxSNopCount);
Quantity -= Arg;
BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
}
Expand Down
24 changes: 8 additions & 16 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
Expand Down Expand Up @@ -109,8 +108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 1
; GCN-NEXT: s_nop 9
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
Expand Down Expand Up @@ -185,8 +183,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
Expand Down Expand Up @@ -220,8 +217,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 1
; GCN-NEXT: s_nop 9
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GCN-NEXT: s_endpgm
bb:
Expand Down Expand Up @@ -277,8 +273,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
Expand All @@ -302,8 +297,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
Expand Down Expand Up @@ -336,8 +330,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
Expand Down Expand Up @@ -369,8 +362,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
Expand Down
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/AMDGPU/acc-ldst.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
Expand All @@ -28,8 +27,7 @@ bb:
; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_read
; GCN: v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]:
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
Expand Down Expand Up @@ -80,8 +78,7 @@ bb:
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-COUNT-32: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]
Expand Down
15 changes: 5 additions & 10 deletions llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
Expand Down Expand Up @@ -181,8 +180,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
Expand Down Expand Up @@ -487,8 +485,7 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
; GFX90A-NEXT: ; copy
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v35 ; Reload Reuse
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a3 v[0:31]
Expand Down Expand Up @@ -965,8 +962,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
Expand Down Expand Up @@ -1084,8 +1080,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
Expand Down
33 changes: 11 additions & 22 deletions llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
Original file line number Diff line number Diff line change
Expand Up @@ -63,52 +63,41 @@ body: |
; GCN16-NEXT: successors: %bb.1(0x80000000)
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_BRANCH %bb.1
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: bb.1:
; GCN16-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: bb.2:
; GCN16-NEXT: successors: %bb.3(0x80000000)
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_NOP 0
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_BRANCH %bb.3
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: bb.3:
; GCN16-NEXT: liveins: $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: SI_RETURN
bb.0:
liveins: $sgpr6, $sgpr10_sgpr11
Expand Down
24 changes: 8 additions & 16 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mov_b32_e32 v0, 2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
Expand Down Expand Up @@ -191,8 +190,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
Expand Down Expand Up @@ -256,8 +254,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mov_b32_e32 v1, 2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
Expand Down Expand Up @@ -308,8 +305,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
Expand Down Expand Up @@ -424,8 +420,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mov_b32_e32 v1, 2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
Expand Down Expand Up @@ -476,8 +471,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
Expand Down Expand Up @@ -513,8 +507,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
Expand All @@ -538,8 +531,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
bb:
Expand Down
Loading