Skip to content

Commit fc5fcc0

Browse files
authored
AMDGPU: Start using AV_MOV_B64_IMM_PSEUDO (llvm#154500)
1 parent 01f785c commit fc5fcc0

File tree

7 files changed

+391
-344
lines changed

7 files changed

+391
-344
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ class AMDGPUPrepareAGPRAllocImpl {
3434
const SIInstrInfo &TII;
3535
MachineRegisterInfo &MRI;
3636

37+
bool isAV64Imm(const MachineOperand &MO) const;
38+
3739
public:
3840
AMDGPUPrepareAGPRAllocImpl(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
3941
: TII(*ST.getInstrInfo()), MRI(MRI) {}
@@ -85,11 +87,16 @@ AMDGPUPrepareAGPRAllocPass::run(MachineFunction &MF,
8587
return PreservedAnalyses::all();
8688
}
8789

90+
/// Returns true when \p MO is an immediate operand and
/// TII.isLegalAV64PseudoImm() accepts its value. run() uses this to decide
/// whether a 64-bit move may be rewritten to AV_MOV_B64_IMM_PSEUDO.
bool AMDGPUPrepareAGPRAllocImpl::isAV64Imm(const MachineOperand &MO) const {
91+
return MO.isImm() && TII.isLegalAV64PseudoImm(MO.getImm());
92+
}
93+
8894
bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) {
8995
if (MRI.isReserved(AMDGPU::AGPR0))
9096
return false;
9197

92-
const MCInstrDesc &AVImmPseudo = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO);
98+
const MCInstrDesc &AVImmPseudo32 = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO);
99+
const MCInstrDesc &AVImmPseudo64 = TII.get(AMDGPU::AV_MOV_B64_IMM_PSEUDO);
93100

94101
bool Changed = false;
95102
for (MachineBasicBlock &MBB : MF) {
@@ -98,8 +105,19 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) {
98105
TII.isInlineConstant(MI, 1)) ||
99106
(MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
100107
MI.getOperand(1).isImm())) {
101-
MI.setDesc(AVImmPseudo);
108+
MI.setDesc(AVImmPseudo32);
109+
Changed = true;
110+
continue;
111+
}
112+
113+
// TODO: If only half of the value is rewritable, is it worth splitting it
114+
// up?
115+
if ((MI.getOpcode() == AMDGPU::V_MOV_B64_e64 ||
116+
MI.getOpcode() == AMDGPU::V_MOV_B64_PSEUDO) &&
117+
isAV64Imm(MI.getOperand(1))) {
118+
MI.setDesc(AVImmPseudo64);
102119
Changed = true;
120+
continue;
103121
}
104122
}
105123
}

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2137,15 +2137,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
21372137
case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
21382138
Register Dst = MI.getOperand(0).getReg();
21392139
if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2140-
uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
2140+
int64_t Imm = MI.getOperand(1).getImm();
21412141

21422142
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
21432143
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
21442144
BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2145-
.addImm(SignExtend64<32>(Lo_32(Imm)))
2145+
.addImm(SignExtend64<32>(Imm))
21462146
.addReg(Dst, RegState::Implicit | RegState::Define);
21472147
BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2148-
.addImm(SignExtend64<32>(Hi_32(Imm)))
2148+
.addImm(SignExtend64<32>(Imm >> 32))
21492149
.addReg(Dst, RegState::Implicit | RegState::Define);
21502150
MI.eraseFromParent();
21512151
break;

llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -114,22 +114,22 @@ body: |
114114
; HAS-AGPR-NEXT: liveins: $vgpr0_vgpr1
115115
; HAS-AGPR-NEXT: {{ $}}
116116
; HAS-AGPR-NEXT: [[V_MOV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec
117-
; HAS-AGPR-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 54, implicit $exec
118-
; HAS-AGPR-NEXT: [[V_MOV_B64_e64_1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 1, implicit $exec
119-
; HAS-AGPR-NEXT: [[V_MOV_B64_e64_2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 64, implicit $exec
120-
; HAS-AGPR-NEXT: [[V_MOV_B64_e64_3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 %stack.0, implicit $exec
121-
; HAS-AGPR-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 65, implicit $exec
122-
; HAS-AGPR-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 279172874240, implicit $exec
123-
; HAS-AGPR-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 279172874305, implicit $exec
124-
; HAS-AGPR-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
125-
; HAS-AGPR-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
126-
; HAS-AGPR-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1042479491, implicit $exec
127-
; HAS-AGPR-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4477415320595726336, implicit $exec
128-
; HAS-AGPR-NEXT: [[V_MOV_B8:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO $vgpr0_vgpr1, implicit $exec
129-
; HAS-AGPR-NEXT: [[V_MOV_B9:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO %stack.0, implicit $exec
117+
; HAS-AGPR-NEXT: [[AV_MOV_:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 54, implicit $exec
118+
; HAS-AGPR-NEXT: [[AV_MOV_1:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 1, implicit $exec
119+
; HAS-AGPR-NEXT: [[AV_MOV_2:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
120+
; HAS-AGPR-NEXT: [[V_MOV_B64_e64_1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 %stack.0, implicit $exec
121+
; HAS-AGPR-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 65, implicit $exec
122+
; HAS-AGPR-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 279172874240, implicit $exec
123+
; HAS-AGPR-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 279172874305, implicit $exec
124+
; HAS-AGPR-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
125+
; HAS-AGPR-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
126+
; HAS-AGPR-NEXT: [[AV_MOV_3:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 1042479491, implicit $exec
127+
; HAS-AGPR-NEXT: [[AV_MOV_4:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 4477415320595726336, implicit $exec
128+
; HAS-AGPR-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO $vgpr0_vgpr1, implicit $exec
129+
; HAS-AGPR-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO %stack.0, implicit $exec
130130
; HAS-AGPR-NEXT: {{ $}}
131131
; HAS-AGPR-NEXT: bb.1:
132-
; HAS-AGPR-NEXT: [[V_MOV_B64_e64_4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 3, implicit $exec
132+
; HAS-AGPR-NEXT: [[AV_MOV_5:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 3, implicit $exec
133133
;
134134
; NO-AGPR-LABEL: name: func64
135135
; NO-AGPR: bb.0:
@@ -181,14 +181,23 @@ tracksRegLiveness: true
181181
body: |
182182
bb.0:
183183
liveins: $vgpr0
184-
; HAS-AGPR-LABEL: name: func64_no_agprs
185-
; HAS-AGPR: liveins: $vgpr0
186-
; HAS-AGPR-NEXT: {{ $}}
187-
; HAS-AGPR-NEXT: [[V_MOV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 1, implicit $exec
188-
; HAS-AGPR-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
189-
; HAS-AGPR-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
190-
; HAS-AGPR-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1042479491, implicit $exec
191-
; HAS-AGPR-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4477415320595726336, implicit $exec
184+
; GFX90A-LABEL: name: func64_no_agprs
185+
; GFX90A: liveins: $vgpr0
186+
; GFX90A-NEXT: {{ $}}
187+
; GFX90A-NEXT: [[V_MOV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 1, implicit $exec
188+
; GFX90A-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
189+
; GFX90A-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
190+
; GFX90A-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1042479491, implicit $exec
191+
; GFX90A-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4477415320595726336, implicit $exec
192+
;
193+
; GFX908-LABEL: name: func64_no_agprs
194+
; GFX908: liveins: $vgpr0
195+
; GFX908-NEXT: {{ $}}
196+
; GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 1, implicit $exec
197+
; GFX908-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
198+
; GFX908-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
199+
; GFX908-NEXT: [[AV_MOV_1:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 1042479491, implicit $exec
200+
; GFX908-NEXT: [[AV_MOV_2:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 4477415320595726336, implicit $exec
192201
;
193202
; NO-AGPR-LABEL: name: func64_no_agprs
194203
; NO-AGPR: liveins: $vgpr0

llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll

Lines changed: 27 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
1616
; CHECK-NEXT: s_bitcmp1_b32 s0, 8
1717
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
1818
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
19-
; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1
2019
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
21-
; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3]
2220
; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78
21+
; CHECK-NEXT: v_accvgpr_write_b32 a3, v1
22+
; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1
23+
; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3]
24+
; CHECK-NEXT: v_accvgpr_write_b32 a2, v0
2325
; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f
2426
; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90
2527
; CHECK-NEXT: v_mov_b32_e32 v5, 0x3efa01a0
@@ -34,14 +36,14 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
3436
; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883
3537
; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4
3638
; CHECK-NEXT: s_mov_b64 s[22:23], 0
37-
; CHECK-NEXT: v_mov_b32_e32 v16, 0x57b87036
38-
; CHECK-NEXT: v_mov_b32_e32 v17, 0x3fb3b136
39+
; CHECK-NEXT: v_mov_b32_e32 v0, 0x57b87036
40+
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fb3b136
3941
; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17]
4042
; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523
4143
; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555
4244
; CHECK-NEXT: s_and_b64 s[6:7], exec, s[18:19]
4345
; CHECK-NEXT: v_mov_b32_e32 v20, 0
44-
; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
46+
; CHECK-NEXT: ; implicit-def: $vgpr30_vgpr31
4547
; CHECK-NEXT: ; implicit-def: $vgpr22_vgpr23
4648
; CHECK-NEXT: s_branch .LBB0_2
4749
; CHECK-NEXT: .LBB0_1: ; %Flow9
@@ -61,9 +63,12 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
6163
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
6264
; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
6365
; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25]
64-
; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[0:1]
66+
; CHECK-NEXT: v_accvgpr_read_b32 v27, a3
67+
; CHECK-NEXT: v_accvgpr_read_b32 v26, a2
6568
; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3]
66-
; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[16:17]
69+
; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[0:1]
70+
; CHECK-NEXT: v_accvgpr_write_b32 a0, 0
71+
; CHECK-NEXT: v_accvgpr_write_b32 a1, 0
6772
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6873
; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[24:25]
6974
; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27]
@@ -79,10 +84,9 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
7984
; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29]
8085
; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[14:15]
8186
; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27]
82-
; CHECK-NEXT: v_fmac_f64_e32 v[30:31], 0, v[28:29]
87+
; CHECK-NEXT: v_fmac_f64_e32 v[16:17], 0, v[28:29]
8388
; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[18:19]
84-
; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[30:31]
85-
; CHECK-NEXT: v_mov_b64_e32 v[30:31], 0
89+
; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[16:17]
8690
; CHECK-NEXT: s_branch .LBB0_6
8791
; CHECK-NEXT: .LBB0_5: ; %Flow
8892
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
@@ -91,30 +95,30 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
9195
; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329
9296
; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1
9397
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
94-
; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[30:31]
98+
; CHECK-NEXT: v_accvgpr_read_b32 v29, a1
99+
; CHECK-NEXT: v_accvgpr_read_b32 v28, a0
95100
; CHECK-NEXT: s_mov_b64 s[24:25], -1
96101
; CHECK-NEXT: s_mov_b64 s[8:9], -1
97102
; CHECK-NEXT: s_mov_b64 vcc, s[2:3]
98-
; CHECK-NEXT: ; implicit-def: $vgpr30_vgpr31
103+
; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
99104
; CHECK-NEXT: s_cbranch_vccz .LBB0_5
100105
; CHECK-NEXT: ; %bb.7: ; %.lr.ph2070.i.i.i3291
101106
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
102-
; CHECK-NEXT: v_accvgpr_read_b32 v31, a1
103-
; CHECK-NEXT: v_accvgpr_read_b32 v30, a0
107+
; CHECK-NEXT: v_accvgpr_write_b32 a0, v30
108+
; CHECK-NEXT: v_accvgpr_write_b32 a1, v31
104109
; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19]
105110
; CHECK-NEXT: s_mov_b64 vcc, s[6:7]
106111
; CHECK-NEXT: s_cbranch_vccz .LBB0_5
107112
; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325
108113
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
114+
; CHECK-NEXT: v_accvgpr_write_b32 a0, v26
109115
; CHECK-NEXT: s_mov_b64 s[24:25], 0
110-
; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[26:27]
116+
; CHECK-NEXT: v_accvgpr_write_b32 a1, v27
111117
; CHECK-NEXT: s_mov_b64 s[8:9], 0
112118
; CHECK-NEXT: s_branch .LBB0_5
113119
; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1
114-
; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[10:11]
115-
; CHECK-NEXT: v_accvgpr_write_b32 a0, v24
116120
; CHECK-NEXT: s_mov_b64 s[22:23], 0
117-
; CHECK-NEXT: v_accvgpr_write_b32 a1, v25
121+
; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11]
118122
; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21]
119123
; CHECK-NEXT: s_branch .LBB0_15
120124
; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1
@@ -128,24 +132,22 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
128132
; CHECK-NEXT: ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit
129133
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
130134
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[28:29]
131-
; CHECK-NEXT: v_accvgpr_write_b32 a0, v24
132135
; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17]
133-
; CHECK-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[8:9]
134-
; CHECK-NEXT: v_mov_b32_e32 v27, v26
135-
; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17]
136136
; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17]
137-
; CHECK-NEXT: global_store_dwordx2 v20, v[26:27], s[12:13]
137+
; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9]
138+
; CHECK-NEXT: v_mov_b32_e32 v17, v16
139+
; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17]
140+
; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13]
138141
; CHECK-NEXT: s_cselect_b32 s23, s23, 0
139142
; CHECK-NEXT: s_cselect_b32 s22, s22, 0
140143
; CHECK-NEXT: s_mov_b64 s[8:9], -1
141144
; CHECK-NEXT: s_branch .LBB0_14
142145
; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_2 Depth=1
143-
; CHECK-NEXT: v_accvgpr_write_b32 a0, v24
144146
; CHECK-NEXT: s_mov_b64 s[8:9], 0
145147
; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0
146148
; CHECK-NEXT: .LBB0_14: ; %Flow6
147149
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
148-
; CHECK-NEXT: v_accvgpr_write_b32 a1, v25
150+
; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25]
149151
; CHECK-NEXT: .LBB0_15: ; %Flow6
150152
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
151153
; CHECK-NEXT: s_mov_b64 s[24:25], -1

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
3939
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
4040
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
4141
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
42-
; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
4342
; GCN-NEXT: v_mov_b32_e32 v16, s16
43+
; GCN-NEXT: v_mov_b32_e32 v17, s17
4444
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
45+
; GCN-NEXT: v_mov_b32_e32 v18, s18
46+
; GCN-NEXT: v_mov_b32_e32 v19, s19
4547
; GCN-NEXT: v_mov_b32_e32 v0, s20
4648
; GCN-NEXT: v_mov_b32_e32 v1, s21
4749
; GCN-NEXT: v_mov_b32_e32 v2, s22
4850
; GCN-NEXT: v_mov_b32_e32 v3, s23
49-
; GCN-NEXT: v_mov_b32_e32 v17, s17
50-
; GCN-NEXT: v_mov_b32_e32 v18, s18
51-
; GCN-NEXT: v_mov_b32_e32 v19, s19
51+
; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
5252
; GCN-NEXT: s_nop 4
5353
; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
5454
; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -112,16 +112,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
112112
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
113113
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
114114
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
115-
; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
116115
; GCN-NEXT: v_mov_b32_e32 v16, s16
116+
; GCN-NEXT: v_mov_b32_e32 v17, s17
117117
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
118+
; GCN-NEXT: v_mov_b32_e32 v18, s18
119+
; GCN-NEXT: v_mov_b32_e32 v19, s19
118120
; GCN-NEXT: v_mov_b32_e32 v0, s20
119121
; GCN-NEXT: v_mov_b32_e32 v1, s21
120122
; GCN-NEXT: v_mov_b32_e32 v2, s22
121123
; GCN-NEXT: v_mov_b32_e32 v3, s23
122-
; GCN-NEXT: v_mov_b32_e32 v17, s17
123-
; GCN-NEXT: v_mov_b32_e32 v18, s18
124-
; GCN-NEXT: v_mov_b32_e32 v19, s19
124+
; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
125125
; GCN-NEXT: s_nop 4
126126
; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
127127
; GCN-NEXT: s_waitcnt vmcnt(0)

0 commit comments

Comments
 (0)