diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 184929a5a50f6..06186ab4e1b2d 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -250,7 +250,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       ++NumOperands;
     }
     if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
-      if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
+      if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::sdst)) {
         DPPInst.add(*SDst);
         ++NumOperands;
       }
@@ -296,11 +296,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
     assert(Src0);
     int Src0Idx = NumOperands;
-    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
-      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
-      Fail = true;
-      break;
-    }
+
     DPPInst.add(*Src0);
     DPPInst->getOperand(NumOperands).setIsKill(false);
     ++NumOperands;
@@ -319,7 +315,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     }
     auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
     if (Src1) {
-      int OpNum = NumOperands;
+      assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1) &&
+             "dpp version of instruction missing src1");
       // If subtarget does not support SGPRs for src1 operand then the
       // requirements are the same as for src0. We check src0 instead because
       // pseudos are shared between subtargets and allow SGPR for src1 on all.
@@ -327,13 +324,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
         assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
                    getOperandSize(*DPPInst, NumOperands, *MRI) &&
                "Src0 and Src1 operands should have the same size");
-        OpNum = Src0Idx;
-      }
-      if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
-        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
-        Fail = true;
-        break;
       }
+
       DPPInst.add(*Src1);
       ++NumOperands;
     }
@@ -349,9 +341,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     }
     auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
     if (Src2) {
-      if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
-          !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
-        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
+      if (!AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src2)) {
+        LLVM_DEBUG(dbgs() << "  failed: dpp does not have src2\n");
         Fail = true;
         break;
       }
@@ -431,6 +422,24 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
     DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
     DPPInst.addImm(CombBCZ ? 1 : 0);
+
+    constexpr AMDGPU::OpName Srcs[] = {
+        AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
+
+    // FIXME: isOperandLegal expects to operate on a completely built
+    // instruction. We should have better legality APIs to check if the
+    // candidate operands will be legal without building the instruction first.
+    for (auto [I, OpName] : enumerate(Srcs)) {
+      int OpIdx = AMDGPU::getNamedOperandIdx(DPPOp, OpName);
+      if (OpIdx == -1)
+        break;
+
+      if (!TII->isOperandLegal(*DPPInst, OpIdx)) {
+        LLVM_DEBUG(dbgs() << "  failed: src" << I << " operand is illegal\n");
+        Fail = true;
+        break;
+      }
+    }
   } while (false);
 
   if (Fail) {
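
The GCNDPPCombine change above replaces the piecemeal isOperandLegal() calls, which previously ran against a half-built instruction, with unconditional operand appends followed by a single validation loop once the candidate DPP instruction is complete; on failure the existing Fail path (visible as trailing context) discards the candidate. A minimal standalone C++ sketch of that build-then-validate shape, with toy types and a toy legality rule rather than the LLVM API:

    #include <cstdio>
    #include <optional>
    #include <vector>

    struct Operand { int Value; };
    struct Instr { std::vector<Operand> Ops; };

    // Stand-in for TII->isOperandLegal(): it may inspect sibling operands,
    // so it only gives a well-defined answer once the whole instruction
    // exists.
    static bool isOperandLegal(const Instr &I, unsigned OpIdx) {
      return I.Ops[OpIdx].Value >= 0; // toy rule
    }

    static std::optional<Instr> buildCandidate(const std::vector<Operand> &Srcs) {
      Instr I;
      for (const Operand &Op : Srcs) // add all operands unconditionally
        I.Ops.push_back(Op);
      for (unsigned Idx = 0; Idx != I.Ops.size(); ++Idx) { // then validate
        if (!isOperandLegal(I, Idx)) {
          std::printf("  failed: src%u operand is illegal\n", Idx);
          return std::nullopt; // caller drops the half-built candidate
        }
      }
      return I;
    }

    int main() {
      if (!buildCandidate({{1}, {-2}, {3}}))
        std::printf("combine rejected\n");
    }
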
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3979e1e0c44aa..a116b57c85a88 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,7 @@ struct FoldCandidate {
 
 class SIFoldOperandsImpl {
 public:
+  MachineFunction *MF;
   MachineRegisterInfo *MRI;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
@@ -705,6 +706,36 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
   }
 
   MachineOperand *New = Fold.Def.OpToFold;
+
+  // Verify the register is compatible with the operand.
+  if (const TargetRegisterClass *OpRC =
+          TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) {
+    const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg());
+    const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
+    unsigned NewSubReg = New->getSubReg();
+    unsigned OldSubReg = Old.getSubReg();
+
+    const TargetRegisterClass *ConstrainRC = OpRC;
+    if (NewSubReg && OldSubReg) {
+      unsigned PreA, PreB;
+      ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC,
+                                                NewSubReg, PreA, PreB);
+    } else if (OldSubReg) {
+      ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
+    } else if (NewSubReg) {
+      ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
+    }
+
+    if (!ConstrainRC)
+      return false;
+
+    if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
+      LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
+                        << TRI->getRegClassName(ConstrainRC) << '\n');
+      return false;
+    }
+  }
+
   // Rework once the VS_16 register class is updated to include proper
   // 16-bit SGPRs instead of 32-bit ones.
   if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
@@ -1429,30 +1460,9 @@ void SIFoldOperandsImpl::foldOperand(
       return;
     }
 
-    if (!FoldingImmLike) {
-      if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
-        // Don't fold if OpToFold doesn't hold an aligned register.
-        const TargetRegisterClass *RC =
-            TRI->getRegClassForReg(*MRI, OpToFold.getReg());
-        assert(RC);
-        if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
-          unsigned SubReg = OpToFold.getSubReg();
-          if (const TargetRegisterClass *SubRC =
-                  TRI->getSubRegisterClass(RC, SubReg))
-            RC = SubRC;
-        }
-
-        if (!RC || !TRI->isProperlyAlignedRC(*RC))
-          return;
-      }
-
-      tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
-
-      // FIXME: We could try to change the instruction from 64-bit to 32-bit
-      // to enable more folding opportunities. The shrink operands pass
-      // already does this.
-      return;
-    }
+    // FIXME: We could try to change the instruction from 64-bit to 32-bit
+    // to enable more folding opportunities. The shrink operands pass
+    // already does this.
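
The SIFoldOperands hunks carry the functional core of the patch: before rewriting a use, updateOperand() now derives the register class the instruction requires for that operand, translates it through any subregister index on either side of the fold (getMatchingSuperRegClass, or getCommonSuperRegClass when both sides carry one), and rejects the fold unless the incoming register can be constrained to the result. That is what allows deleting the ad-hoc needsAlignedVGPRs() check in foldOperand(): an unaligned register now simply fails the generic constraint. A toy model of the constrain step, with a register class reduced to the set of registers it admits (illustrative names and values, not the LLVM API):

    #include <cstdio>
    #include <set>
    #include <string>
    #include <utility>

    using RegClass = std::set<std::string>; // a class = its allowed registers

    // Stand-in for MRI->constrainRegClass(): tighten Current to its
    // intersection with Required, or fail if that intersection is empty.
    static bool constrainRegClass(RegClass &Current, const RegClass &Required) {
      RegClass Common;
      for (const std::string &R : Current)
        if (Required.count(R))
          Common.insert(R);
      if (Common.empty())
        return false;
      Current = std::move(Common);
      return true;
    }

    int main() {
      // gfx90a-flavored example: 64-bit VGPR tuples vs. the even-aligned
      // subset the instruction operand actually accepts.
      RegClass VReg64 = {"v[0:1]", "v[1:2]", "v[2:3]"};
      const RegClass VReg64Align2 = {"v[0:1]", "v[2:3]"};
      if (constrainRegClass(VReg64, VReg64Align2))
        std::printf("fold ok; class tightened to %zu registers\n", VReg64.size());
    }
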
 
     tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
   }
 
@@ -2747,6 +2757,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
 }
 
 bool SIFoldOperandsImpl::run(MachineFunction &MF) {
+  this->MF = &MF;
   MRI = &MF.getRegInfo();
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d43924d46b005..c5e8f95748cf1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4572,9 +4572,8 @@ static bool compareMachineOp(const MachineOperand &Op0,
   }
 }
 
-bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
                                     const MachineOperand &MO) const {
-  const MCInstrDesc &InstDesc = MI.getDesc();
   const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
 
   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
@@ -4586,9 +4585,9 @@ bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
     return false;
 
   if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
-    if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
-        OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
-                                                    AMDGPU::OpName::src2))
+    if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
+        OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
+                                                     AMDGPU::OpName::src2))
       return false;
     return RI.opCanUseInlineConstant(OpInfo.OperandType);
   }
@@ -4596,7 +4595,7 @@
   if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
     return false;
 
-  if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
+  if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
     return true;
 
   return ST.hasVOP3Literal();
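
With the SIInstrInfo.cpp change, isImmOperandLegal() keys off an MCInstrDesc instead of a built MachineInstr, so a pass can ask whether an immediate would be legal for a candidate opcode before any instruction for it exists, which is exactly what the deferred checks in GCNDPPCombine need. A standalone sketch of such a descriptor-based query, using stand-in types and a toy rule loosely modeled on AMDGPU's -16..64 integer inline-constant range (the real function also checks per-operand types and the literal budget):

    #include <cstdint>
    #include <cstdio>

    struct MCInstrDesc {
      unsigned NumOperands; // per-operand type info elided in this sketch
    };

    // Toy rule: small integers can be encoded inline; anything else would
    // need a literal slot, which this sketch simply rejects.
    static bool isInlineConstant(int64_t Imm) { return Imm >= -16 && Imm <= 64; }

    static bool isImmOperandLegal(const MCInstrDesc &Desc, unsigned OpNo,
                                  int64_t Imm) {
      if (OpNo >= Desc.NumOperands)
        return false;
      return isInlineConstant(Imm);
    }

    int main() {
      MCInstrDesc Candidate{3};
      // No instruction has been built yet; the descriptor alone suffices.
      std::printf("64 -> %d, 12345678 -> %d\n",
                  isImmOperandLegal(Candidate, 1, 64),
                  isImmOperandLegal(Candidate, 1, 12345678));
    }
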
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f7c7bb509c9ef..958af0ff1147f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -533,13 +533,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return get(Opcode).TSFlags & SIInstrFlags::VOP2;
   }
 
-  static bool isVOP3(const MachineInstr &MI) {
-    return MI.getDesc().TSFlags & SIInstrFlags::VOP3;
+  static bool isVOP3(const MCInstrDesc &Desc) {
+    return Desc.TSFlags & SIInstrFlags::VOP3;
   }
 
-  bool isVOP3(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::VOP3;
-  }
+  static bool isVOP3(const MachineInstr &MI) { return isVOP3(MI.getDesc()); }
+
+  bool isVOP3(uint16_t Opcode) const { return isVOP3(get(Opcode)); }
 
   static bool isSDWA(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::SDWA;
@@ -841,13 +841,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return get(Opcode).TSFlags & SIInstrFlags::VINTRP;
   }
 
-  static bool isMAI(const MachineInstr &MI) {
-    return MI.getDesc().TSFlags & SIInstrFlags::IsMAI;
+  static bool isMAI(const MCInstrDesc &Desc) {
+    return Desc.TSFlags & SIInstrFlags::IsMAI;
   }
 
-  bool isMAI(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::IsMAI;
-  }
+  static bool isMAI(const MachineInstr &MI) { return isMAI(MI.getDesc()); }
+
+  bool isMAI(uint16_t Opcode) const { return isMAI(get(Opcode)); }
 
   static bool isMFMA(const MachineInstr &MI) {
     return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
@@ -1180,9 +1180,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return isInlineConstant(*MO.getParent(), MO.getOperandNo());
   }
 
-  bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+  bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
                          const MachineOperand &MO) const;
 
+  bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+                         const MachineOperand &MO) const {
+    return isImmOperandLegal(MI.getDesc(), OpNo, MO);
+  }
+
   /// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
   bool isLegalAV64PseudoImm(uint64_t Imm) const;
 
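
The header changes all follow one refactoring pattern: each predicate is implemented once on the lowest-level type (MCInstrDesc), and the MachineInstr and opcode forms become thin forwarding overloads, so no flag-testing logic is duplicated. The shape, as a compilable standalone sketch with an illustrative flag bit standing in for the real SIInstrFlags value:

    #include <cstdint>
    #include <cstdio>

    namespace SIInstrFlags {
    constexpr uint64_t VOP3 = 1ull << 3; // illustrative bit, not the real one
    } // namespace SIInstrFlags

    struct MCInstrDesc { uint64_t TSFlags; };
    struct MachineInstr {
      MCInstrDesc Desc;
      const MCInstrDesc &getDesc() const { return Desc; }
    };

    // One source of truth on the descriptor...
    static bool isVOP3(const MCInstrDesc &Desc) {
      return Desc.TSFlags & SIInstrFlags::VOP3;
    }
    // ...and a forwarding overload for callers holding a built instruction.
    static bool isVOP3(const MachineInstr &MI) { return isVOP3(MI.getDesc()); }

    int main() {
      MachineInstr MI{{SIInstrFlags::VOP3}};
      std::printf("isVOP3: %d\n", isVOP3(MI));
    }
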
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index fb20e72a77103..3725384e885ee 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -1,6 +1,6 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
 
 ---
 
@@ -8,8 +8,7 @@
 # GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
 # GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
 # GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
-# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
-# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
 name: vop3
 tracksRegLiveness: true
 body: |
@@ -39,12 +38,9 @@
 
 # GCN-LABEL: name: vop3_sgpr_src1
 # GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
-# GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
-# GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
-# GFX1100: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
-# GFX1150: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
-# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
-# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
 # GCN: %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
 name: vop3_sgpr_src1
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
index a0ea04b1b9c0f..8326862706a02 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
@@ -31,9 +31,8 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_64 = IMPLICIT_DEF
     %2:areg_64_align2 = COPY killed %1
@@ -105,9 +104,8 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_96 = IMPLICIT_DEF
     %2:areg_96_align2 = COPY killed %1
@@ -234,9 +232,8 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_128 = IMPLICIT_DEF
     %2:areg_128_align2 = COPY killed %1
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
index a54c0accce783..5f9b71c0c2198 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
@@ -46,9 +46,8 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_64 = IMPLICIT_DEF
     %2:vreg_64_align2 = COPY killed %1
@@ -148,9 +147,8 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_96 = IMPLICIT_DEF
     %2:vreg_96_align2 = COPY killed %1
@@ -326,11 +324,59 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_128 = IMPLICIT_DEF
     %2:vreg_128_align2 = COPY killed %1
     GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, implicit $exec
 ...
+
+# Make sure the alignment requirement is respected for VS_64 operand
+# uses.
+---
+name: aligned_vgpr_vs_64_constraint
+tracksRegLiveness: true
+isSSA: true
+body: |
+  bb.0.entry:
+    liveins: $vgpr0, $sgpr8_sgpr9
+
+    ; GFX908-LABEL: name: aligned_vgpr_vs_64_constraint
+    ; GFX908: liveins: $vgpr0, $sgpr8_sgpr9
+    ; GFX908-NEXT: {{ $}}
+    ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+    ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX908-NEXT: [[GLOBAL_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR [[COPY]], [[COPY1]], 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+    ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX3_SADDR]].sub0
+    ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; GFX908-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed [[REG_SEQUENCE]], 0, [[GLOBAL_LOAD_DWORDX3_SADDR]].sub1_sub2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX908-NEXT: DS_WRITE_B64_gfx9 [[V_MOV_B32_e32_]], killed [[V_PK_ADD_F32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
+    ; GFX908-NEXT: S_ENDPGM 0
+    ;
+    ; GFX90A-LABEL: name: aligned_vgpr_vs_64_constraint
+    ; GFX90A: liveins: $vgpr0, $sgpr8_sgpr9
+    ; GFX90A-NEXT: {{ $}}
+    ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR [[COPY]], [[COPY1]], 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+    ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX3_SADDR]].sub0
+    ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX3_SADDR]].sub1_sub2
+    ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; GFX90A-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed [[REG_SEQUENCE]], 0, killed [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX90A-NEXT: DS_WRITE_B64_gfx9 [[V_MOV_B32_e32_]], killed [[V_PK_ADD_F32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
+    ; GFX90A-NEXT: S_ENDPGM 0
+    %0:sgpr_64 = COPY $sgpr8_sgpr9
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR %0, %1, 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+    %3:vgpr_32 = COPY %2.sub0
+    %4:vreg_64_align2 = COPY killed %2.sub1_sub2
+    %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %6:vreg_64_align2 = REG_SEQUENCE %3, %subreg.sub0, %5, %subreg.sub1
+    %7:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed %6, 0, killed %4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B64_gfx9 %5, killed %7, 0, 0, implicit $exec :: (store (s64), addrspace 3)
+    S_ENDPGM 0
+
+...