From 19decad1b4fc681772386e0d25e31160e9e689f8 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes
Date: Thu, 16 Oct 2025 14:49:26 -0700
Subject: [PATCH] [AMDGPU] Optionally use the downcasted version for SchedGroups

Change-Id: Iffc6b6309ba050f139298d88c1dbdb9ab0fe1fd3
---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp    | 156 ++++++++---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp       |  35 +++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h         |   4 +
 llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp |  53 +++-
 .../CodeGen/AMDGPU/sched.group.downcast.mir  | 244 ++++++++++++++++++
 5 files changed, 451 insertions(+), 41 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 5700468e2420e..a1a9b2b7162ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -60,6 +60,17 @@ static cl::opt<bool> UseCostHeur(
         "Experimentally, results are mixed, so this should be set on a "
         "case-by-case basis."));
 
+static cl::opt<bool> UseDowncastOps(
+    "amdgpu-igrouplp-use-downcast-ops", cl::Hidden,
+    cl::desc("Whether to use the downcast alternative OpCodes instead of the "
+             "current OpCode. Under certain conditions, some OpCodes may be "
+             "downcast to an alternative sequence after scheduling "
+             "(e.g. V_PK_MUL_F32 -> V_MUL_F32). This flag enables SchedGroup "
+             "classification based on the alternative."),
+    cl::init(false));
+
 // Components of the mask that determines which instruction types may be
 // classified into a SchedGroup.
 enum class SchedGroupMask {
@@ -133,6 +144,8 @@ class SchedGroup {
   // SGID is used to map instructions to candidate SchedGroups
   unsigned SGID;
 
+  unsigned CurrentSize = 0;
+
   // The different rules each instruction in this SchedGroup must conform to
   SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;
 
@@ -143,9 +156,14 @@ class SchedGroup {
   bool tryAddEdge(SUnit *A, SUnit *B);
 
   // Use SGMask to determine whether we can classify MI as a member of this
-  // SchedGroup object.
+  // SchedGroup object. If UseDowncastOps is specified and MI is a candidate
+  // for downcasting, classify using the downcasted OpCodes.
   bool canAddMI(const MachineInstr &MI) const;
 
+  // Use SGMask to determine whether we can classify an opcode as a member of
+  // this SchedGroup object.
+  bool canAddSingleMI(unsigned Opcode, bool MayLoad, bool MayStore) const;
+
 public:
   // Collection of SUnits that are classified as members of this group.
   SmallVector<SUnit *, 32> Collection;
@@ -176,7 +194,7 @@ class SchedGroup {
   void link(SchedGroup &OtherGroup);
 
   // Returns true if no more instructions may be added to this group.
-  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
+  bool isFull() const { return MaxSize && CurrentSize >= *MaxSize; }
 
   // Append a constraint that SUs must meet in order to fit into this
   // SchedGroup.
Since many rules involve the relationship between a SchedGroup @@ -202,10 +220,55 @@ class SchedGroup { << format_hex((int)SGMask, 10, true) << " adding " << *SU.getInstr()); Collection.push_back(&SU); + MachineInstr &MI = *SU.getInstr(); + if (!UseDowncastOps || MI.isMetaInstruction()) { + ++CurrentSize; + return; + } + + SmallVector UnpackSequence; + if (!TII->getDowncastSequence(MI, UnpackSequence, + DAG->MF.getSubtarget())) { + ++CurrentSize; + return; + } + + for (unsigned UnpackOp : UnpackSequence) { + if (canAddSingleMI(UnpackOp, MI.mayLoad(), MI.mayStore())) + ++CurrentSize; + } } // Remove last element in the SchedGroup - void pop() { Collection.pop_back(); } + void pop() { + SUnit *SU = Collection.pop_back_val(); + MachineInstr &MI = *SU->getInstr(); + if (!UseDowncastOps || MI.isMetaInstruction()) { + assert(CurrentSize >= 1); + --CurrentSize; + return; + } + + SmallVector UnpackSequence; + if (!TII->getDowncastSequence(MI, UnpackSequence, + DAG->MF.getSubtarget())) { + assert(CurrentSize >= 1); + --CurrentSize; + return; + } + + for (unsigned UnpackOp : UnpackSequence) { + if (canAddSingleMI(UnpackOp, MI.mayLoad(), MI.mayStore())) { + assert(CurrentSize >= 1); + --CurrentSize; + } + } + } + + void clear() { + Collection.clear(); + CurrentSize = 0; + } // Identify and add all relevant SUs from the DAG to this SchedGroup. void initSchedGroup(); @@ -371,16 +434,16 @@ class PipelineSolver { }; void PipelineSolver::reset() { - for (auto &SyncPipeline : CurrPipeline) { for (auto &SG : SyncPipeline) { SmallVector TempCollection = SG.Collection; - SG.Collection.clear(); + SG.clear(); auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) { return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER; }); - if (SchedBarr != TempCollection.end()) - SG.Collection.push_back(*SchedBarr); + if (SchedBarr != TempCollection.end()) { + SG.add(**SchedBarr); + } } } @@ -2386,64 +2449,99 @@ bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { return false; } -bool SchedGroup::canAddMI(const MachineInstr &MI) const { +bool SchedGroup::canAddSingleMI(unsigned Opcode, bool MayLoad, + bool MayStore) const { bool Result = false; - if (MI.isMetaInstruction()) - Result = false; - else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && - (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || - TII->isTRANS(MI))) - Result = !MI.mayLoadOrStore(); + if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && + (TII->isVALU(Opcode) || TII->isMFMAorWMMA(Opcode) || + TII->isSALU(Opcode) || TII->isTRANS(Opcode))) + Result = !(MayLoad || MayStore); else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && - TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) { + TII->isVALU(Opcode) && !TII->isMFMAorWMMA(Opcode) && + !TII->isTRANS(Opcode)) { // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS). // For our purposes, these shall not be classified as VALU as this results // in unexpected behavior. 
- Result = !MI.mayLoadOrStore(); + Result = !(MayLoad || MayStore); } else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && - TII->isSALU(MI)) - Result = !MI.mayLoadOrStore(); + TII->isSALU(Opcode)) + Result = !(MayLoad || MayStore); else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && - TII->isMFMAorWMMA(MI)) + TII->isMFMAorWMMA(Opcode)) Result = true; else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && - TII->isVMEM(MI)) + (TII->isVMEM(Opcode) || TII->isFLAT(Opcode))) Result = true; else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && - MI.mayLoad() && TII->isVMEM(MI)) + MayLoad && (TII->isVMEM(Opcode) || TII->isFLAT(Opcode))) Result = true; else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && - MI.mayStore() && TII->isVMEM(MI)) + MayStore && (TII->isVMEM(Opcode) || TII->isFLAT(Opcode))) Result = true; else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && - TII->isDS(MI)) + TII->isDS(Opcode)) Result = true; else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && - MI.mayLoad() && TII->isDS(MI)) + MayLoad && TII->isDS(Opcode)) Result = true; else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && - MI.mayStore() && TII->isDS(MI)) + MayStore && TII->isDS(Opcode)) Result = true; else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) && - TII->isTRANS(MI)) + TII->isTRANS(Opcode)) Result = true; - LLVM_DEBUG( - dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) - << (Result ? " could classify " : " unable to classify ") << MI); + return Result; +} + +bool SchedGroup::canAddMI(const MachineInstr &MI) const { + bool Result = false; + + auto emitDebug = [this](const MachineInstr &MI, bool Result) { + LLVM_DEBUG(dbgs() << "For SchedGroup with mask " + << format_hex((int)SGMask, 10, true) + << (Result ? " could classify " : " unable to classify ") + << MI); + }; + + if (MI.isMetaInstruction()) { + emitDebug(MI, false); + return false; + } + + if (!UseDowncastOps) { + Result = canAddSingleMI(MI.getOpcode(), MI.mayLoad(), MI.mayStore()); + emitDebug(MI, Result); + return Result; + } + + SmallVector UnpackSequence; + if (!TII->getDowncastSequence(MI, UnpackSequence, + DAG->MF.getSubtarget())) { + Result = canAddSingleMI(MI.getOpcode(), MI.mayLoad(), MI.mayStore()); + emitDebug(MI, Result); + return Result; + } + + // We have an unpackable MI, check if the unpack OpCodes are classifiable by + // this mask. + for (unsigned UnpackOp : UnpackSequence) { + Result |= canAddSingleMI(UnpackOp, MI.mayLoad(), MI.mayStore()); + } + emitDebug(MI, Result); return Result; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 50447f48a628c..17f5789afdd4c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6366,6 +6366,41 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } +bool SIInstrInfo::getDowncastSequence(const MachineInstr &MI, + SmallVectorImpl &Sequence, + const GCNSubtarget &ST) const { + bool isGFX940Plus = ST.hasGFX940Insts(); + switch (MI.getOpcode()) { + // Use 64 bit encoding to allow use of VOP3 instructions. + // VOP3 e64 instructions allow source modifiers + // e32 instructions don't allow source modifiers. 
+ case AMDGPU::V_PK_ADD_F32: { + if (!isGFX940Plus) + return false; + Sequence.push_back(AMDGPU::V_ADD_F32_e64); + Sequence.push_back(AMDGPU::V_ADD_F32_e64); + return true; + } + case AMDGPU::V_PK_MUL_F32: { + if (!isGFX940Plus) + return false; + Sequence.push_back(AMDGPU::V_MUL_F32_e64); + Sequence.push_back(AMDGPU::V_MUL_F32_e64); + return true; + } + case AMDGPU::V_PK_FMA_F32: { + if (!isGFX940Plus) + return false; + Sequence.push_back(AMDGPU::V_FMA_F32_e64); + Sequence.push_back(AMDGPU::V_FMA_F32_e64); + return true; + } + default: + return false; + } + llvm_unreachable("Fully covered switch"); +} + bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const { bool IsGFX950Only = ST.hasGFX950Insts(); bool IsGFX940Only = ST.hasGFX940Insts(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index df27ec1f8de8c..e51f3b996e250 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1237,6 +1237,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isNeverCoissue(MachineInstr &MI) const; + bool getDowncastSequence(const MachineInstr &MI, + SmallVectorImpl &Sequence, + const GCNSubtarget &ST) const; + /// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO. bool isLegalAV64PseudoImm(uint64_t Imm) const; diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e111ec862..b06c3f0a89399 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -51,7 +51,8 @@ class SIPreEmitPeephole { // for unpacking. void collectUnpackingCandidates(MachineInstr &BeginMI, SetVector &InstrsToUnpack, - uint16_t NumMFMACycles); + uint16_t NumMFMACycles, + const GCNSubtarget &ST); // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1] // op_sel_hi:[0,0,0] // ==> @@ -63,7 +64,7 @@ class SIPreEmitPeephole { // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for // this transformation. - void performF32Unpacking(MachineInstr &I); + void performF32Unpacking(MachineInstr &I, const GCNSubtarget &ST); // Select corresponding unpacked instruction uint16_t mapToUnpackedOpcode(MachineInstr &I); // Creates the unpacked instruction to be inserted. Adds source modifiers to @@ -583,20 +584,33 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI, void SIPreEmitPeephole::collectUnpackingCandidates( MachineInstr &BeginMI, SetVector &InstrsToUnpack, - uint16_t NumMFMACycles) { + uint16_t NumMFMACycles, const GCNSubtarget &ST) { auto *BB = BeginMI.getParent(); auto E = BB->end(); int TotalCyclesBetweenCandidates = 0; auto SchedModel = TII->getSchedModel(); + const MCSchedModel *MCSchedMod = SchedModel.getMCSchedModel(); Register MFMADef = BeginMI.getOperand(0).getReg(); for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; - uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr); - bool IsUnpackable = - !(UnpackedOpCode == std::numeric_limits::max()); if (Instr.isMetaInstruction()) continue; + + SmallVector UnpackSequence; + bool IsUnpackable = TII->getDowncastSequence(Instr, UnpackSequence, ST); + + // We only support unpacking where the unpack sequence is all the same + // opcode. To support more complex sequences we must teach + // performF32Unpacking how to handle them. 
The unpack sequence used in performF32Unpacking must agree with
+    // TII->getDowncastSequence, as that method is used for some scheduling
+    // decisions under the assumption that it describes the sequence that will
+    // actually be used for unpacking.
+    IsUnpackable &=
+        all_of(UnpackSequence, [&UnpackSequence](unsigned CurrentOpcode) {
+          return CurrentOpcode == UnpackSequence[0];
+        });
+
     if ((Instr.isTerminator()) ||
         (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
         (SIInstrInfo::modifiesModeRegister(Instr) &&
@@ -631,18 +645,33 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
     // latency, add latency of two unpacked instructions (currently estimated
     // as 2 cycles).
     TotalCyclesBetweenCandidates -= Latency;
-    // TODO: improve latency handling based on instruction modeling.
-    TotalCyclesBetweenCandidates += 2;
+
+    for (unsigned Opcode : UnpackSequence) {
+      unsigned SchedClass = TII->get(Opcode).getSchedClass();
+      const MCSchedClassDesc *SCDesc =
+          MCSchedMod->getSchedClassDesc(SchedClass);
+
+      // FIXME: We don't have an opcode-based SchedClass resolution for
+      // variant SchedClasses. This is a non-issue currently as none of the
+      // unpack instructions have variant SchedClasses.
+      assert(!SCDesc->isVariant());
+      uint16_t Latency =
+          SchedModel.getWriteProcResBegin(SCDesc)->ReleaseAtCycle;
+      TotalCyclesBetweenCandidates += Latency;
+    }
     // Subtract 1 to account for MFMA issue latency.
     if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
       InstrsToUnpack.insert(&Instr);
   }
 }
 
-void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
+void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I,
+                                            const GCNSubtarget &ST) {
   MachineOperand DstOp = I.getOperand(0);
 
-  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+  SmallVector<uint16_t, 2> UnpackSequence;
+  TII->getDowncastSequence(I, UnpackSequence, ST);
+  uint16_t UnpackedOpcode = UnpackSequence[0];
   assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
          "Unsupported Opcode");
 
@@ -786,10 +815,10 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
           SchedModel.resolveSchedClass(&MI);
       uint16_t NumMFMACycles =
           SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
-      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
+      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles, ST);
     }
     for (MachineInstr *MI : InstrsToUnpack) {
-      performF32Unpacking(*MI);
+      performF32Unpacking(*MI, ST);
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir b/llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir
new file mode 100644
index 0000000000000..5f16e7ddfd090
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir
@@ -0,0 +1,244 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s -check-prefixes=DEFAULT,GCN
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-igrouplp-use-downcast-ops=1 -run-pass=machine-scheduler -o - %s | FileCheck %s -check-prefixes=DOWNCAST,GCN
+
+
+# The default produces the prescribed pipeline as written. With
+# amdgpu-igrouplp-use-downcast-ops, the four V_PK_ADD_F32 count as 8 VALU, so
+# each 2-VALU group is filled by a single packed add and the schedule differs.
+ +--- +name: 2xVALU_1xSALU_2xVALU_1xSALU +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; DEFAULT-LABEL: name: 2xVALU_1xSALU_2xVALU_1xSALU + ; DEFAULT: liveins: $vgpr0_vgpr1 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: $exec = IMPLICIT_DEF + ; DEFAULT-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; DEFAULT-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF2]], 8, [[DEF3]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DEFAULT-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF3]], 8, [[V_PK_ADD_F32_]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc + ; DEFAULT-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_]], 8, [[V_PK_ADD_F32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: dead [[V_PK_ADD_F32_3:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_1]], 8, [[V_PK_ADD_F32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc + ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; DEFAULT-NEXT: S_ENDPGM 0 + ; + ; DOWNCAST-LABEL: name: 2xVALU_1xSALU_2xVALU_1xSALU + ; DOWNCAST: liveins: $vgpr0_vgpr1 + ; DOWNCAST-NEXT: {{ $}} + ; DOWNCAST-NEXT: $exec = IMPLICIT_DEF + ; DOWNCAST-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; DOWNCAST-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; DOWNCAST-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; DOWNCAST-NEXT: [[DEF3:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; DOWNCAST-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF2]], 8, [[DEF3]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DOWNCAST-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DOWNCAST-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DOWNCAST-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc + ; DOWNCAST-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF3]], 8, [[V_PK_ADD_F32_]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DOWNCAST-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_]], 8, [[V_PK_ADD_F32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DOWNCAST-NEXT: dead [[V_PK_ADD_F32_3:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_1]], 8, [[V_PK_ADD_F32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DOWNCAST-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc + ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; DOWNCAST-NEXT: S_ENDPGM 0 + $exec = IMPLICIT_DEF + %0:vgpr_32 = IMPLICIT_DEF + %1:sgpr_128 = IMPLICIT_DEF + %2:vreg_64_align2 = IMPLICIT_DEF + %3:vreg_64_align2 = IMPLICIT_DEF + %4:vreg_64_align2 = V_PK_ADD_F32 8, %2, 8, %3, 11, 0, 0, 0, 0, implicit $mode, implicit 
$exec + %5:vreg_64_align2 = V_PK_ADD_F32 8, %3, 8, %4, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + %6:vreg_64_align2 = V_PK_ADD_F32 8, %4, 8, %5, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + %7:vreg_64_align2 = V_PK_ADD_F32 8, %5, 8, %6, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + %8:sgpr_32 = IMPLICIT_DEF + %9:sgpr_32 = IMPLICIT_DEF + %10:sgpr_32 = S_ADD_U32 %8, %9, implicit-def $scc + %11:sgpr_32 = S_ADD_U32 %9, %10, implicit-def $scc + SCHED_GROUP_BARRIER 2, 2, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + SCHED_GROUP_BARRIER 2, 2, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + S_ENDPGM 0 +... + +# amdgpu-igrouplp-use-downcast-ops should have no effect since the ops aren't candidates for downcast + +--- +name: 2xVALU_1xSALU_2xVALU_1xSALU_nonunpack +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GCN-LABEL: name: 2xVALU_1xSALU_2xVALU_1xSALU_nonunpack + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = IMPLICIT_DEF + ; GCN-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[DEF2]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[DEF3]], 0, [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc + ; GCN-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_ADD_F32_e64_2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc + ; GCN-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; GCN-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; GCN-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; GCN-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; GCN-NEXT: S_ENDPGM 0 + $exec = IMPLICIT_DEF + %0:vgpr_32 = IMPLICIT_DEF + %1:sgpr_128 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = V_ADD_F32_e64 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec + %5:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %4, 0, 0, implicit $mode, implicit $exec + %6:vgpr_32 = V_ADD_F32_e64 0, %4, 0, %5, 0, 0, implicit $mode, implicit $exec + %7:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec + %8:sgpr_32 = IMPLICIT_DEF + %9:sgpr_32 = IMPLICIT_DEF + %10:sgpr_32 = S_ADD_U32 %8, %9, implicit-def $scc + %11:sgpr_32 = S_ADD_U32 %9, %10, implicit-def $scc + SCHED_GROUP_BARRIER 2, 2, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + SCHED_GROUP_BARRIER 2, 2, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + S_ENDPGM 0 +... + +# amdgpu-igrouplp-use-downcast-ops should schedule 2 v_pk between the SALU, since v_pk will be unpacked. 
+ +--- +name: 4xVALU_1xSALU_4xVALU_1xSALU +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; DEFAULT-LABEL: name: 4xVALU_1xSALU_4xVALU_1xSALU + ; DEFAULT: liveins: $vgpr0_vgpr1 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: $exec = IMPLICIT_DEF + ; DEFAULT-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; DEFAULT-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF2]], 8, [[DEF3]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DEFAULT-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF3]], 8, [[V_PK_ADD_F32_]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_]], 8, [[V_PK_ADD_F32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: dead [[V_PK_ADD_F32_3:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_1]], 8, [[V_PK_ADD_F32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc + ; DEFAULT-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc + ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 2, 4, 0 + ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 2, 4, 0 + ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; DEFAULT-NEXT: S_ENDPGM 0 + ; + ; DOWNCAST-LABEL: name: 4xVALU_1xSALU_4xVALU_1xSALU + ; DOWNCAST: liveins: $vgpr0_vgpr1 + ; DOWNCAST-NEXT: {{ $}} + ; DOWNCAST-NEXT: $exec = IMPLICIT_DEF + ; DOWNCAST-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; DOWNCAST-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; DOWNCAST-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; DOWNCAST-NEXT: [[DEF3:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; DOWNCAST-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF2]], 8, [[DEF3]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DOWNCAST-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DOWNCAST-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DOWNCAST-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF3]], 8, [[V_PK_ADD_F32_]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DOWNCAST-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc + ; DOWNCAST-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_]], 8, [[V_PK_ADD_F32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DOWNCAST-NEXT: dead [[V_PK_ADD_F32_3:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_1]], 8, [[V_PK_ADD_F32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; DOWNCAST-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc + ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 2, 4, 0 + ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 2, 4, 0 + ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; DOWNCAST-NEXT: S_ENDPGM 0 + $exec = IMPLICIT_DEF + %0:vgpr_32 = IMPLICIT_DEF + %1:sgpr_128 = IMPLICIT_DEF + %2:vreg_64_align2 = IMPLICIT_DEF + %3:vreg_64_align2 = IMPLICIT_DEF + %4:vreg_64_align2 = V_PK_ADD_F32 8, %2, 8, %3, 11, 0, 0, 0, 0, implicit $mode, implicit 
$exec + %5:vreg_64_align2 = V_PK_ADD_F32 8, %3, 8, %4, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + %6:vreg_64_align2 = V_PK_ADD_F32 8, %4, 8, %5, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + %7:vreg_64_align2 = V_PK_ADD_F32 8, %5, 8, %6, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + %8:sgpr_32 = IMPLICIT_DEF + %9:sgpr_32 = IMPLICIT_DEF + %10:sgpr_32 = S_ADD_U32 %8, %9, implicit-def $scc + %11:sgpr_32 = S_ADD_U32 %9, %10, implicit-def $scc + SCHED_GROUP_BARRIER 2, 4, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + SCHED_GROUP_BARRIER 2, 4, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + S_ENDPGM 0 +... + +# amdgpu-igrouplp-use-downcast-ops should have no effect since the ops aren't candidates for downcast + +--- +name: 4xVALU_1xSALU_4xVALU_1xSALU_nonunpack +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GCN-LABEL: name: 4xVALU_1xSALU_4xVALU_1xSALU_nonunpack + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = IMPLICIT_DEF + ; GCN-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[DEF2]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[DEF3]], 0, [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_ADD_F32_e64_2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc + ; GCN-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc + ; GCN-NEXT: SCHED_GROUP_BARRIER 2, 4, 0 + ; GCN-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; GCN-NEXT: SCHED_GROUP_BARRIER 2, 4, 0 + ; GCN-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; GCN-NEXT: S_ENDPGM 0 + $exec = IMPLICIT_DEF + %0:vgpr_32 = IMPLICIT_DEF + %1:sgpr_128 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = V_ADD_F32_e64 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec + %5:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %4, 0, 0, implicit $mode, implicit $exec + %6:vgpr_32 = V_ADD_F32_e64 0, %4, 0, %5, 0, 0, implicit $mode, implicit $exec + %7:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec + %8:sgpr_32 = IMPLICIT_DEF + %9:sgpr_32 = IMPLICIT_DEF + %10:sgpr_32 = S_ADD_U32 %8, %9, implicit-def $scc + %11:sgpr_32 = S_ADD_U32 %9, %10, implicit-def $scc + SCHED_GROUP_BARRIER 2, 4, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + SCHED_GROUP_BARRIER 2, 4, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + S_ENDPGM 0 +...
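
For intuition, the following is a minimal, self-contained C++ sketch of the size accounting the new flag enables. It is illustrative only: it is not part of the patch, and none of the names below are LLVM APIs. With amdgpu-igrouplp-use-downcast-ops, an opcode that has a downcast sequence counts one SchedGroup slot per classifiable unpacked opcode, so a 4-VALU group admits two V_PK_ADD_F32 instead of four, matching the DOWNCAST checks in the tests above.

// downcast_size_accounting.cpp -- illustrative sketch, not LLVM code.
#include <cstdio>
#include <initializer_list>
#include <vector>

// Hypothetical stand-ins for real opcodes.
enum Opcode { V_PK_ADD_F32, V_ADD_F32_e64, S_ADD_U32 };

// Analogue of TII->getDowncastSequence(): a packed F32 add downcasts to two
// e64 adds; everything else has no downcast alternative (empty sequence).
static std::vector<Opcode> getDowncastSequence(Opcode Op) {
  if (Op == V_PK_ADD_F32)
    return {V_ADD_F32_e64, V_ADD_F32_e64};
  return {};
}

// Analogue of canAddSingleMI() for a VALU-only SchedGroup mask.
static bool isVALU(Opcode Op) { return Op != S_ADD_U32; }

struct Group {
  unsigned MaxSize;
  bool UseDowncastOps;
  unsigned CurrentSize;

  Group(unsigned MaxSize, bool UseDowncastOps)
      : MaxSize(MaxSize), UseDowncastOps(UseDowncastOps), CurrentSize(0) {}

  bool isFull() const { return CurrentSize >= MaxSize; }

  // Analogue of SchedGroup::add(): with the flag, a downcastable op counts
  // once per classifiable opcode in its downcast sequence; otherwise once.
  void add(Opcode Op) {
    std::vector<Opcode> Seq =
        UseDowncastOps ? getDowncastSequence(Op) : std::vector<Opcode>();
    if (Seq.empty()) {
      ++CurrentSize;
      return;
    }
    for (Opcode Unpacked : Seq)
      if (isVALU(Unpacked))
        ++CurrentSize;
  }
};

int main() {
  for (bool UseDowncast : {false, true}) {
    Group VALUGroup(/*MaxSize=*/4, UseDowncast);
    unsigned PackedAdded = 0;
    while (!VALUGroup.isFull()) {
      VALUGroup.add(V_PK_ADD_F32);
      ++PackedAdded;
    }
    // Prints 4 without the flag and 2 with it, mirroring the
    // 4xVALU_1xSALU_4xVALU_1xSALU test above.
    std::printf("UseDowncastOps=%d: a 4-VALU group admits %u V_PK_ADD_F32\n",
                UseDowncast, PackedAdded);
  }
  return 0;
}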