Commit b26e934

[AMDGPU] Emit separate v_mov_b32s if v_mov_b64_pseudo destination vgprs are misaligned
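
Summary (drawn from the diffs below): on subtargets that require even-aligned VGPR pairs for 64-bit vector registers, the register allocator may now assign V_MOV_B64_PSEUDO a misaligned destination pair. Post-RA expansion therefore checks that the destination is contained in the destination register class of v_mov_b64_e32 / v_pk_mov_b32 before using those encodings, and otherwise writes the two 32-bit halves with separate v_mov_b32s. From the new v_mov_b64_expansion.mir case:

  $vgpr5_vgpr6 = V_MOV_B64_PSEUDO 0, implicit $exec
  # expands to:
  $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
  $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6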
Parent: 0c84643

7 files changed: 108 additions, 17 deletions

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 24 additions & 14 deletions
@@ -2131,11 +2131,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
     Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
 
+    const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
+    const TargetRegisterClass *Mov64RC =
+        getRegClass(Mov64Desc, /*OpNum=*/0);
+
     const MachineOperand &SrcOp = MI.getOperand(1);
     // FIXME: Will this work for 64-bit floating point immediates?
     assert(!SrcOp.isFPImm());
-    if (ST.hasMovB64()) {
-      MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
+    if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
+      MI.setDesc(Mov64Desc);
       if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
           isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
         break;
@@ -2144,17 +2148,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       APInt Imm(64, SrcOp.getImm());
       APInt Lo(32, Imm.getLoBits(32).getZExtValue());
       APInt Hi(32, Imm.getHiBits(32).getZExtValue());
-      if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
-        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
-            .addImm(SISrcMods::OP_SEL_1)
-            .addImm(Lo.getSExtValue())
-            .addImm(SISrcMods::OP_SEL_1)
-            .addImm(Lo.getSExtValue())
-            .addImm(0) // op_sel_lo
-            .addImm(0) // op_sel_hi
-            .addImm(0) // neg_lo
-            .addImm(0) // neg_hi
-            .addImm(0); // clamp
+      const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
+      const TargetRegisterClass *PkMovRC =
+          getRegClass(PkMovDesc, /*OpNum=*/0);
+
+      if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
+          PkMovRC->contains(Dst)) {
+        BuildMI(MBB, MI, DL, PkMovDesc, Dst)
+            .addImm(SISrcMods::OP_SEL_1)
+            .addImm(Lo.getSExtValue())
+            .addImm(SISrcMods::OP_SEL_1)
+            .addImm(Lo.getSExtValue())
+            .addImm(0) // op_sel_lo
+            .addImm(0) // op_sel_hi
+            .addImm(0) // neg_lo
+            .addImm(0) // neg_hi
+            .addImm(0); // clamp
       } else {
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
             .addImm(Lo.getSExtValue())
@@ -5172,7 +5181,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     // aligned register constraint.
     // FIXME: We do not verify inline asm operands, but custom inline asm
     // verification is broken anyway
-    if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
+    if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
+        Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
       const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
       if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
         if (const TargetRegisterClass *SubRC =
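
Note: the new Mov64RC->contains(Dst) and PkMovRC->contains(Dst) checks gate the 64-bit forms on the allocated destination actually belonging to each instruction's destination register class (the even-aligned VGPR class when ST.needsAlignedVGPRs()); when a check fails, the existing else-path emits separate V_MOV_B32_e32 writes to DstLo and DstHi. The verifier change mirrors the existing AV_MOV_B64_IMM_PSEUDO exemption, since V_MOV_B64_PSEUDO may now legitimately carry a misaligned destination until it is expanded.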

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
 
 // 64-bit vector move instruction. This is mainly used by the
 // SIFoldOperands pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst),
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)> {
   let isReMaterializable = 1;
   let isAsCheapAsAMove = 1;
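
Note: relaxing the destination class from VReg_64_AlignTarget to VReg_64 is what allows an odd-first pair such as v[5:6] to be allocated here; the expansion change in SIInstrInfo.cpp keeps the lowering legal in that case.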

llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir

Lines changed: 20 additions & 0 deletions
@@ -208,3 +208,23 @@ body: |
     ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 -16, implicit $exec, implicit-def $vgpr1_vgpr2
     $vgpr1_vgpr2 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
 ...
+
+---
+name: av_mov_b64_misalign_vgpr
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: av_mov_b64_misalign_vgpr
+    ; CHECK: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+    ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+    $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
+
+---
+name: av_mov_b64_misalign_agpr
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: av_mov_b64_misalign_agpr
+    ; CHECK: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+    ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+    $agpr5_agpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-after=si-load-store-opt %s -o - | FileCheck %s
+
+# CHECK: misaligned_regsequence:
+# CHECK: ; %bb.0:
+# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
+# CHECK: v_mov_b32_e32 v5, 0
+# CHECK: v_mov_b32_e32 v4, 0
+# CHECK: v_mov_b32_e32 v6, 0
+# CHECK: s_waitcnt lgkmcnt(0)
+# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
+# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
+# CHECK: s_endpgm
+
+---
+name: misaligned_regsequence
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr4_sgpr5
+
+    %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3:sgpr_64(p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+    %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %10:vreg_64_align2 = COPY %8:sreg_64_xexec
+    %11:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+    %13:vreg_96_align2 = REG_SEQUENCE killed %9:vgpr_32, %subreg.sub0, killed %11:vreg_64_align2, %subreg.sub1_sub2
+    FLAT_STORE_DWORDX3 %10:vreg_64_align2, killed %13:vreg_96_align2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4)
+    S_ENDPGM 0
+...

llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir

Lines changed: 3 additions & 2 deletions
@@ -425,7 +425,7 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_v_mov_b64_64_to_unaligned
     ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
     ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
     %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
     %1:vreg_64 = COPY killed %0
@@ -437,7 +437,8 @@ name: fold_v_mov_b64_pseudo_64_to_unaligned
 body: |
   bb.0:
     ; GCN-LABEL: name: fold_v_mov_b64_pseudo_64_to_unaligned
-    ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN: [[V_MOV_B64_PSEUDO:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
     ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
     %0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
     %1:vreg_64 = COPY killed %0
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+define amdgpu_kernel void @foo(ptr %0) {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    v_mov_b32_e32 v4, v3
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[2:4]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %1 = getelementptr i8, ptr %0, i64 4
+  store i32 0, ptr %0, align 4
+  store i64 0, ptr %1, align 4
+  ret void
+}

llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir

Lines changed: 9 additions & 0 deletions
@@ -93,3 +93,12 @@ body: |
   bb.0:
     $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4575657222473777152, implicit $exec
 ...
+
+# GCN-LABEL: name: v_mov_b64_misalign
+# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+name: v_mov_b64_misalign
+body: |
+  bb.0:
+    $vgpr5_vgpr6 = V_MOV_B64_PSEUDO 0, implicit $exec
+...
