diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 630fdc8e8891c..fbdb14ebe83ce 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2131,11 +2131,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
     Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
 
+    const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
+    const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
+
     const MachineOperand &SrcOp = MI.getOperand(1);
     // FIXME: Will this work for 64-bit floating point immediates?
     assert(!SrcOp.isFPImm());
-    if (ST.hasMovB64()) {
-      MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
+    if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
+      MI.setDesc(Mov64Desc);
       if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
           isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
         break;
@@ -2144,17 +2147,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     APInt Imm(64, SrcOp.getImm());
     APInt Lo(32, Imm.getLoBits(32).getZExtValue());
     APInt Hi(32, Imm.getHiBits(32).getZExtValue());
-    if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
-      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
-          .addImm(SISrcMods::OP_SEL_1)
-          .addImm(Lo.getSExtValue())
-          .addImm(SISrcMods::OP_SEL_1)
-          .addImm(Lo.getSExtValue())
-          .addImm(0) // op_sel_lo
-          .addImm(0) // op_sel_hi
-          .addImm(0) // neg_lo
-          .addImm(0) // neg_hi
-          .addImm(0); // clamp
+    const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
+    const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
+
+    if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
+        PkMovRC->contains(Dst)) {
+      BuildMI(MBB, MI, DL, PkMovDesc, Dst)
+          .addImm(SISrcMods::OP_SEL_1)
+          .addImm(Lo.getSExtValue())
+          .addImm(SISrcMods::OP_SEL_1)
+          .addImm(Lo.getSExtValue())
+          .addImm(0) // op_sel_lo
+          .addImm(0) // op_sel_hi
+          .addImm(0) // neg_lo
+          .addImm(0) // neg_hi
+          .addImm(0); // clamp
     } else {
       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
           .addImm(Lo.getSExtValue())
@@ -5172,7 +5179,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     // aligned register constraint.
     // FIXME: We do not verify inline asm operands, but custom inline asm
     // verification is broken anyway
-    if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
+    if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
+        Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
       const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
       if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
         if (const TargetRegisterClass *SubRC =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1282ece9dc875..f465a0efa8ddd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -131,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
 
 // 64-bit vector move instruction. This is mainly used by the
 // SIFoldOperands pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst),
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)> {
   let isReMaterializable = 1;
   let isAsCheapAsAMove = 1;
diff --git a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
index c52347b680371..d08185a9e0ccd 100644
--- a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
@@ -208,3 +208,23 @@ body: |
     ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 -16, implicit $exec, implicit-def $vgpr1_vgpr2
     $vgpr1_vgpr2 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
 ...
+
+---
+name: av_mov_b64_misalign_vgpr
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: av_mov_b64_misalign_vgpr
+    ; CHECK: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+    ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+    $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
+
+---
+name: av_mov_b64_misalign_agpr
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: av_mov_b64_misalign_agpr
+    ; CHECK: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+    ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+    $agpr5_agpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
new file mode 100644
index 0000000000000..c55c8bb4c506e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
@@ -0,0 +1,30 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-after=si-load-store-opt %s -o - | FileCheck %s
+
+# CHECK: misaligned_regsequence:
+# CHECK: ; %bb.0:
+# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
+# CHECK: v_mov_b32_e32 v5, 0
+# CHECK: v_mov_b32_e32 v4, 0
+# CHECK: v_mov_b32_e32 v6, 0
+# CHECK: s_waitcnt lgkmcnt(0)
+# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
+# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
+# CHECK: s_endpgm
+
+---
+name: misaligned_regsequence
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr4_sgpr5
+
+    %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3:sgpr_64(p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+    %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %10:vreg_64_align2 = COPY %8:sreg_64_xexec
+    %11:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+    %13:vreg_96_align2 = REG_SEQUENCE killed %9:vgpr_32, %subreg.sub0, killed %11:vreg_64_align2, %subreg.sub1_sub2
+    FLAT_STORE_DWORDX3 %10:vreg_64_align2, killed %13:vreg_96_align2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4)
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
index 21455a9f5074f..176d7752133cf 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
@@ -425,7 +425,7 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_v_mov_b64_64_to_unaligned
     ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
     ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
     %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
     %1:vreg_64 = COPY killed %0
@@ -437,7 +437,8 @@ name: fold_v_mov_b64_pseudo_64_to_unaligned
 body: |
   bb.0:
     ; GCN-LABEL: name: fold_v_mov_b64_pseudo_64_to_unaligned
-    ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN: [[V_MOV_B64_PSEUDO:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
     ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
     %0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
     %1:vreg_64 = COPY killed %0
diff --git a/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
new file mode 100644
index 0000000000000..e95aba71775b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+define amdgpu_kernel void @foo(ptr %0) {
+; CHECK-LABEL: foo:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    v_mov_b32_e32 v4, v3
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[2:4]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %1 = getelementptr inbounds i8, ptr %0, i64 4
+  store i32 0, ptr %0, align 4
+  store i64 0, ptr %1, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
index 70e2987454192..4c68c4519302a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -93,3 +93,12 @@ body: |
   bb.0:
     $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4575657222473777152, implicit $exec
 ...
+
+# GCN-LABEL: name: v_mov_b64_misalign
+# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+name: v_mov_b64_misalign
+body: |
+  bb.0:
+    $vgpr5_vgpr6 = V_MOV_B64_PSEUDO 0, implicit $exec
+...