Skip to content

Commit b93b955

Browse files
committed
[AMDGPU] Emit separate v_mov_b32s if v_mov_b64_pseudo destination vgprs are misaligned
1 parent 3be8294 commit b93b955

File tree

3 files changed: +69 additions, -2 deletions (lines changed)

3 files changed: +69 additions, -2 deletions (lines changed)

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2149,7 +2149,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
21492149
const MachineOperand &SrcOp = MI.getOperand(1);
21502150
// FIXME: Will this work for 64-bit floating point immediates?
21512151
assert(!SrcOp.isFPImm());
2152-
if (ST.hasMovB64()) {
2152+
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
2153+
const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Dst);
2154+
if (ST.hasMovB64() && RI.isProperlyAlignedRC(*RC)) {
21532155
MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
21542156
if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
21552157
isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
@@ -2159,7 +2161,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
21592161
APInt Imm(64, SrcOp.getImm());
21602162
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
21612163
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2162-
if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2164+
if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2165+
RI.isProperlyAlignedRC(*RC)) {
21632166
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
21642167
.addImm(SISrcMods::OP_SEL_1)
21652168
.addImm(Lo.getSExtValue())
Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,33 @@
1+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-after=si-load-store-opt %s -o - | FileCheck %s
2+
3+
# CHECK: "misaligned-regsequence":
4+
# CHECK: ; %bb.0:
5+
# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6+
# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
7+
# CHECK: v_mov_b32_e32 v5, 0
8+
# CHECK: v_mov_b32_e32 v4, 0
9+
# CHECK: v_mov_b32_e32 v6, 0
10+
# CHECK: s_waitcnt lgkmcnt(0)
11+
# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
12+
# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
13+
# CHECK: s_endpgm
14+
15+
--- |
16+
define void @misaligned-regsequence() { ret void }
17+
...
18+
---
19+
name: misaligned-regsequence
20+
tracksRegLiveness: true
21+
body: |
22+
bb.0:
23+
liveins: $sgpr4_sgpr5
24+
25+
%3:sgpr_64(p4) = COPY $sgpr4_sgpr5
26+
%8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3:sgpr_64(p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
27+
%9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
28+
%10:vreg_64_align2 = COPY %8:sreg_64_xexec
29+
%11:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
30+
%13:vreg_96_align2 = REG_SEQUENCE killed %9:vgpr_32, %subreg.sub0, killed %11:vreg_64_align2, %subreg.sub1_sub2
31+
FLAT_STORE_DWORDX3 %10:vreg_64_align2, killed %13:vreg_96_align2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4)
32+
S_ENDPGM 0
33+
...
Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,31 @@
1+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=postrapseudos %s -o - | FileCheck %s
2+
3+
# CHECK: v_mov_b64_misalign:
4+
# CHECK: ; %bb.0:
5+
# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6+
# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
7+
# CHECK: v_mov_b32_e32 v5, 0
8+
# CHECK: v_mov_b32_e32 v4, 0
9+
# CHECK: v_mov_b32_e32 v6, 0
10+
# CHECK: s_waitcnt lgkmcnt(0)
11+
# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
12+
# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
13+
# CHECK: s_endpgm
14+
15+
---
16+
name: v_mov_b64_misalign
17+
tracksRegLiveness: true
18+
body: |
19+
bb.0.entry:
20+
liveins: $sgpr4_sgpr5
21+
22+
frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02
23+
frame-setup CFI_INSTRUCTION undefined $pc_reg
24+
renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
25+
renamable $vgpr4 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
26+
renamable $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
27+
renamable $vgpr2_vgpr3 = COPY killed renamable $sgpr0_sgpr1, implicit $exec
28+
FLAT_STORE_DWORDX3 killed renamable $vgpr2_vgpr3, killed renamable $vgpr4_vgpr5_vgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4)
29+
S_ENDPGM 0
30+
...
31+

0 commit comments

Comments
 (0)