36 changes: 22 additions & 14 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2131,11 +2131,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
     Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
 
+    const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
+    const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
+
     const MachineOperand &SrcOp = MI.getOperand(1);
     // FIXME: Will this work for 64-bit floating point immediates?
     assert(!SrcOp.isFPImm());
-    if (ST.hasMovB64()) {
-      MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
+    if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
+      MI.setDesc(Mov64Desc);
       if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
           isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
         break;
@@ -2144,17 +2147,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       APInt Imm(64, SrcOp.getImm());
       APInt Lo(32, Imm.getLoBits(32).getZExtValue());
       APInt Hi(32, Imm.getHiBits(32).getZExtValue());
-      if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
-        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
-            .addImm(SISrcMods::OP_SEL_1)
-            .addImm(Lo.getSExtValue())
-            .addImm(SISrcMods::OP_SEL_1)
-            .addImm(Lo.getSExtValue())
-            .addImm(0) // op_sel_lo
-            .addImm(0) // op_sel_hi
-            .addImm(0) // neg_lo
-            .addImm(0) // neg_hi
-            .addImm(0); // clamp
+      const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
+      const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
+
+      if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
+          PkMovRC->contains(Dst)) {
+        BuildMI(MBB, MI, DL, PkMovDesc, Dst)
+            .addImm(SISrcMods::OP_SEL_1)
+            .addImm(Lo.getSExtValue())
+            .addImm(SISrcMods::OP_SEL_1)
+            .addImm(Lo.getSExtValue())
+            .addImm(0) // op_sel_lo
+            .addImm(0) // op_sel_hi
+            .addImm(0) // neg_lo
+            .addImm(0) // neg_hi
+            .addImm(0); // clamp
       } else {
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
             .addImm(Lo.getSExtValue())
@@ -5172,7 +5179,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
       // aligned register constraint.
       // FIXME: We do not verify inline asm operands, but custom inline asm
       // verification is broken anyway
-      if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
+      if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
+          Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
         const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
         if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
           if (const TargetRegisterClass *SubRC =
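Net effect of the two SIInstrInfo.cpp hunks: both wide encodings are now gated on the destination pair actually being a member of the instruction's destination register class, so a misaligned VGPR pair falls through to the per-half expansion, while the verifier exemption lets the pseudo itself carry the unaligned pair until expansion. A minimal sketch of the resulting decision ladder (a simplified paraphrase of the diff above, not the literal LLVM code; names abbreviated):

    // Sketch: encoding choice for V_MOV_B64_PSEUDO after this patch.
    // Dst is the 64-bit destination; Lo/Hi are the immediate halves.
    if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
      // v_mov_b64_e32: only if Dst satisfies the instruction's
      // (even-aligned, on targets that need it) destination class.
    } else if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
               PkMovRC->contains(Dst)) {
      // v_pk_mov_b32: same containment gate on its destination class.
    } else {
      // Fallback: two v_mov_b32_e32 writes to sub0/sub1, which are
      // legal for any VGPR pair, aligned or not.
    }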
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -131,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),

 // 64-bit vector move instruction. This is mainly used by the
 // SIFoldOperands pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst),
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)> {
   let isReMaterializable = 1;
   let isAsCheapAsAMove = 1;
20 changes: 20 additions & 0 deletions llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
@@ -208,3 +208,23 @@ body: |
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 -16, implicit $exec, implicit-def $vgpr1_vgpr2
$vgpr1_vgpr2 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
...
Contributor Author: These seem to have the intent of the tests I'm adding, but they appear to sidestep the verifier error because of the isUInt<32> check on the immediate. Should these be changed or merged with the new tests?

Contributor: AV_MOV_B64_IMM_PSEUDO already doesn't require alignment; I think this already works.

Contributor Author: In the case of gfx942 (and without the changes in this PR), my added test for vgpr5_vgpr6 seems to emit a misaligned v_mov_b64 (https://godbolt.org/z/ch9qcWc3s).
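For a local reproduction of the same expansion, a sketch of an invocation — the -run-pass name is an assumption borrowed from similar post-RA expansion MIR tests, not taken from this file's actual RUN line:

    # Assumed invocation; adjust to this test's real RUN line.
    llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=postrapseudos \
        llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir -o -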


---
name: av_mov_b64_misalign_vgpr
body: |
bb.0:
; CHECK-LABEL: name: av_mov_b64_misalign_vgpr
; CHECK: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
$vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
...

---
name: av_mov_b64_misalign_agpr
body: |
bb.0:
; CHECK-LABEL: name: av_mov_b64_misalign_agpr
; CHECK: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
$agpr5_agpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
...
30 changes: 30 additions & 0 deletions llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
@@ -0,0 +1,30 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-after=si-load-store-opt %s -o - | FileCheck %s
Contributor: This is a weird way to write the test. Can you replace it with an end-to-end IR test that hits this problem?

Contributor Author: I can't add the end-to-end IR test, since #154115, which exposed the code path to the misaligned v_mov_b64, was reverted. I was planning to add the IR test in a reland of #154115 after this has landed, but I can add it here as a sort of precommit since it's related.


# CHECK: misaligned_regsequence:
# CHECK: ; %bb.0:
# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
# CHECK: v_mov_b32_e32 v5, 0
# CHECK: v_mov_b32_e32 v4, 0
# CHECK: v_mov_b32_e32 v6, 0
# CHECK: s_waitcnt lgkmcnt(0)
# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
# CHECK: s_endpgm

---
name: misaligned_regsequence
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr4_sgpr5

%3:sgpr_64(p4) = COPY $sgpr4_sgpr5
%8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3:sgpr_64(p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
%9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%10:vreg_64_align2 = COPY %8:sreg_64_xexec
%11:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
%13:vreg_96_align2 = REG_SEQUENCE killed %9:vgpr_32, %subreg.sub0, killed %11:vreg_64_align2, %subreg.sub1_sub2
FLAT_STORE_DWORDX3 %10:vreg_64_align2, killed %13:vreg_96_align2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4)
S_ENDPGM 0
...
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
@@ -425,7 +425,7 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_v_mov_b64_64_to_unaligned
     ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
     ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
     %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
     %1:vreg_64 = COPY killed %0
@@ -437,7 +437,8 @@ name: fold_v_mov_b64_pseudo_64_to_unaligned
 body: |
   bb.0:
     ; GCN-LABEL: name: fold_v_mov_b64_pseudo_64_to_unaligned
-    ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN: [[V_MOV_B64_PSEUDO:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
     ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
     %0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
     %1:vreg_64 = COPY killed %0
21 changes: 21 additions & 0 deletions llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
@@ -0,0 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s

define amdgpu_kernel void @foo(ptr %0) {
; CHECK-LABEL: foo:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v3, v2
; CHECK-NEXT: v_mov_b32_e32 v4, v3
; CHECK-NEXT: v_mov_b32_e32 v3, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[2:4]
; CHECK-NEXT: s_endpgm
entry:
%1 = getelementptr inbounds i8, ptr %0, i64 4
store i32 0, ptr %0, align 4
store i64 0, ptr %1, align 4
ret void
}
9 changes: 9 additions & 0 deletions llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -93,3 +93,12 @@ body: |
bb.0:
$vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4575657222473777152, implicit $exec
...

# GCN-LABEL: name: v_mov_b64_misalign
# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
name: v_mov_b64_misalign
body: |
bb.0:
$vgpr5_vgpr6 = V_MOV_B64_PSEUDO 0, implicit $exec
...