jrbyrnes (Contributor)

Adds an option to use the unpack sequence of instructions when making SchedGroup assignment decisions.

To facilitate this, the patch also adds TII->getDowncastSequence, which forces agreement between the unpacker and the scheduler-side code on how the unpacking will be done.
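As a rough sketch (not part of the patch), the scheduler-side consumer can query that sequence to decide how many group slots a packed instruction should occupy. This condenses the `SchedGroup::add`/`canAddSingleMI` accounting from the diff below; the helper name `countGroupSlots` and the callback parameter are illustrative only:

```cpp
// Illustrative sketch: condenses the SchedGroup::add() accounting from this
// patch. Assumes the TII->getDowncastSequence interface added below; the
// CanAddOp callback stands in for SchedGroup::canAddSingleMI.
static unsigned
countGroupSlots(const MachineInstr &MI, const SIInstrInfo *TII,
                const GCNSubtarget &ST,
                llvm::function_ref<bool(unsigned, bool, bool)> CanAddOp) {
  // Meta instructions and non-downcastable instructions occupy one slot.
  SmallVector<unsigned, 4> UnpackSequence;
  if (MI.isMetaInstruction() ||
      !TII->getDowncastSequence(MI, UnpackSequence, ST))
    return 1;

  // A downcastable instruction (e.g. V_PK_MUL_F32 -> 2x V_MUL_F32_e64)
  // occupies one slot per unpacked opcode that this group's mask classifies.
  unsigned Slots = 0;
  for (unsigned UnpackOp : UnpackSequence)
    if (CanAddOp(UnpackOp, MI.mayLoad(), MI.mayStore()))
      ++Slots;
  return Slots;
}
```

Because SIPreEmitPeephole consumes the same sequence when it actually unpacks, both sides stay in sync through the single `getDowncastSequence` query.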

Change-Id: Iffc6b6309ba050f139298d88c1dbdb9ab0fe1fd3

llvmbot commented Oct 17, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

Changes

Adds an option to use the unpack sequence of instructions when making SchedGroup assignment decisions.

To facilitate this, the patch also adds TII->getDowncastSequence, which forces agreement between the unpacker and the scheduler-side code on how the unpacking will be done.


Patch is 30.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164024.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (+127-29)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+35)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+4)
  • (modified) llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (+41-12)
  • (added) llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir (+244)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 5700468e2420e..a1a9b2b7162ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -60,6 +60,17 @@ static cl::opt<bool> UseCostHeur(
              "Experimentally, results are mixed, so this should be set on a "
              "case-by-case basis."));
 
+static cl::opt<bool> UseDowncastOps(
+    "amdgpu-igrouplp-use-downcast-ops", cl::Hidden,
+    cl::desc("Whether to use the downcast alternative OpCodes instead of the "
+             "current OpCode. Under certain conditions, some OpCodes may be "
+             "downcast "
+             "to an alternative sequence after scheduling (e.g. V_PK_MUL_F32 "
+             "-> V_MUL_F32). "
+             "This flag enables SchedGroup classification based on the "
+             "alternative."),
+    cl::init(false));
+
 // Components of the mask that determines which instruction types may be may be
 // classified into a SchedGroup.
 enum class SchedGroupMask {
@@ -133,6 +144,8 @@ class SchedGroup {
   // SGID is used to map instructions to candidate SchedGroups
   unsigned SGID;
 
+  unsigned CurrentSize = 0;
+
   // The different rules each instruction in this SchedGroup must conform to
   SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;
 
@@ -143,9 +156,14 @@ class SchedGroup {
   bool tryAddEdge(SUnit *A, SUnit *B);
 
   // Use SGMask to determine whether we can classify MI as a member of this
-  // SchedGroup object.
+  // SchedGroup object. If UseDowncastOps is specified, and this is a candidate
+  // for downcasting, then use the DownCasted OpCodes.
   bool canAddMI(const MachineInstr &MI) const;
 
+  // Use SGMask to determine whether we can classify an opcode as a member of
+  // this SchedGroup object.
+  bool canAddSingleMI(unsigned Opcode, bool MayLoad, bool MayStore) const;
+
 public:
   // Collection of SUnits that are classified as members of this group.
   SmallVector<SUnit *, 32> Collection;
@@ -176,7 +194,7 @@ class SchedGroup {
   void link(SchedGroup &OtherGroup);
 
   // Returns true if no more instructions may be added to this group.
-  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
+  bool isFull() const { return MaxSize && CurrentSize >= *MaxSize; }
 
   // Append a constraint that SUs must meet in order to fit into this
   // SchedGroup. Since many rules involve the relationship between a SchedGroup
@@ -202,10 +220,55 @@ class SchedGroup {
                       << format_hex((int)SGMask, 10, true) << " adding "
                       << *SU.getInstr());
     Collection.push_back(&SU);
+    MachineInstr &MI = *SU.getInstr();
+    if (!UseDowncastOps || MI.isMetaInstruction()) {
+      ++CurrentSize;
+      return;
+    }
+
+    SmallVector<unsigned, 4> UnpackSequence;
+    if (!TII->getDowncastSequence(MI, UnpackSequence,
+                                  DAG->MF.getSubtarget<GCNSubtarget>())) {
+      ++CurrentSize;
+      return;
+    }
+
+    for (unsigned UnpackOp : UnpackSequence) {
+      if (canAddSingleMI(UnpackOp, MI.mayLoad(), MI.mayStore()))
+        ++CurrentSize;
+    }
   }
 
   // Remove last element in the SchedGroup
-  void pop() { Collection.pop_back(); }
+  void pop() {
+    SUnit *SU = Collection.pop_back_val();
+    MachineInstr &MI = *SU->getInstr();
+    if (!UseDowncastOps || MI.isMetaInstruction()) {
+      assert(CurrentSize >= 1);
+      --CurrentSize;
+      return;
+    }
+
+    SmallVector<unsigned, 4> UnpackSequence;
+    if (!TII->getDowncastSequence(MI, UnpackSequence,
+                                  DAG->MF.getSubtarget<GCNSubtarget>())) {
+      assert(CurrentSize >= 1);
+      --CurrentSize;
+      return;
+    }
+
+    for (unsigned UnpackOp : UnpackSequence) {
+      if (canAddSingleMI(UnpackOp, MI.mayLoad(), MI.mayStore())) {
+        assert(CurrentSize >= 1);
+        --CurrentSize;
+      }
+    }
+  }
+
+  void clear() {
+    Collection.clear();
+    CurrentSize = 0;
+  }
 
   // Identify and add all relevant SUs from the DAG to this SchedGroup.
   void initSchedGroup();
@@ -371,16 +434,16 @@ class PipelineSolver {
 };
 
 void PipelineSolver::reset() {
-
   for (auto &SyncPipeline : CurrPipeline) {
     for (auto &SG : SyncPipeline) {
       SmallVector<SUnit *, 32> TempCollection = SG.Collection;
-      SG.Collection.clear();
+      SG.clear();
       auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
         return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
       });
-      if (SchedBarr != TempCollection.end())
-        SG.Collection.push_back(*SchedBarr);
+      if (SchedBarr != TempCollection.end()) {
+        SG.add(**SchedBarr);
+      }
     }
   }
 
@@ -2386,64 +2449,99 @@ bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
   return false;
 }
 
-bool SchedGroup::canAddMI(const MachineInstr &MI) const {
+bool SchedGroup::canAddSingleMI(unsigned Opcode, bool MayLoad,
+                                bool MayStore) const {
   bool Result = false;
-  if (MI.isMetaInstruction())
-    Result = false;
 
-  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
-           (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
-            TII->isTRANS(MI)))
-    Result = !MI.mayLoadOrStore();
+  if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
+      (TII->isVALU(Opcode) || TII->isMFMAorWMMA(Opcode) ||
+       TII->isSALU(Opcode) || TII->isTRANS(Opcode)))
+    Result = !(MayLoad || MayStore);
 
   else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
-           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) {
+           TII->isVALU(Opcode) && !TII->isMFMAorWMMA(Opcode) &&
+           !TII->isTRANS(Opcode)) {
     // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
     // For our purposes, these shall not be classified as VALU as this results
     // in unexpected behavior.
-    Result = !MI.mayLoadOrStore();
+    Result = !(MayLoad || MayStore);
   }
 
   else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
-           TII->isSALU(MI))
-    Result = !MI.mayLoadOrStore();
+           TII->isSALU(Opcode))
+    Result = !(MayLoad || MayStore);
 
   else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
-           TII->isMFMAorWMMA(MI))
+           TII->isMFMAorWMMA(Opcode))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
-           TII->isVMEM(MI))
+           (TII->isVMEM(Opcode) || TII->isFLAT(Opcode)))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
-           MI.mayLoad() && TII->isVMEM(MI))
+           MayLoad && (TII->isVMEM(Opcode) || TII->isFLAT(Opcode)))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
-           MI.mayStore() && TII->isVMEM(MI))
+           MayStore && (TII->isVMEM(Opcode) || TII->isFLAT(Opcode)))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
-           TII->isDS(MI))
+           TII->isDS(Opcode))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
-           MI.mayLoad() && TII->isDS(MI))
+           MayLoad && TII->isDS(Opcode))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
-           MI.mayStore() && TII->isDS(MI))
+           MayStore && TII->isDS(Opcode))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
-           TII->isTRANS(MI))
+           TII->isTRANS(Opcode))
     Result = true;
 
-  LLVM_DEBUG(
-      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
-             << (Result ? " could classify " : " unable to classify ") << MI);
+  return Result;
+}
+
+bool SchedGroup::canAddMI(const MachineInstr &MI) const {
+  bool Result = false;
+
+  auto emitDebug = [this](const MachineInstr &MI, bool Result) {
+    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
+                      << format_hex((int)SGMask, 10, true)
+                      << (Result ? " could classify " : " unable to classify ")
+                      << MI);
+  };
+
+  if (MI.isMetaInstruction()) {
+    emitDebug(MI, false);
+    return false;
+  }
+
+  if (!UseDowncastOps) {
+    Result = canAddSingleMI(MI.getOpcode(), MI.mayLoad(), MI.mayStore());
+    emitDebug(MI, Result);
+    return Result;
+  }
+
+  SmallVector<unsigned, 4> UnpackSequence;
+  if (!TII->getDowncastSequence(MI, UnpackSequence,
+                                DAG->MF.getSubtarget<GCNSubtarget>())) {
+    Result = canAddSingleMI(MI.getOpcode(), MI.mayLoad(), MI.mayStore());
+    emitDebug(MI, Result);
+    return Result;
+  }
+
+  // We have an unpackable MI, check if the unpack OpCodes are classifiable by
+  // this mask.
+  for (unsigned UnpackOp : UnpackSequence) {
+    Result |= canAddSingleMI(UnpackOp, MI.mayLoad(), MI.mayStore());
+  }
 
+  emitDebug(MI, Result);
   return Result;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 50447f48a628c..17f5789afdd4c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6366,6 +6366,41 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
   return isImmOperandLegal(MI, OpIdx, *MO);
 }
 
+bool SIInstrInfo::getDowncastSequence(const MachineInstr &MI,
+                                      SmallVectorImpl<unsigned> &Sequence,
+                                      const GCNSubtarget &ST) const {
+  bool isGFX940Plus = ST.hasGFX940Insts();
+  switch (MI.getOpcode()) {
+  // Use 64 bit encoding to allow use of VOP3 instructions.
+  // VOP3 e64 instructions allow source modifiers
+  // e32 instructions don't allow source modifiers.
+  case AMDGPU::V_PK_ADD_F32: {
+    if (!isGFX940Plus)
+      return false;
+    Sequence.push_back(AMDGPU::V_ADD_F32_e64);
+    Sequence.push_back(AMDGPU::V_ADD_F32_e64);
+    return true;
+  }
+  case AMDGPU::V_PK_MUL_F32: {
+    if (!isGFX940Plus)
+      return false;
+    Sequence.push_back(AMDGPU::V_MUL_F32_e64);
+    Sequence.push_back(AMDGPU::V_MUL_F32_e64);
+    return true;
+  }
+  case AMDGPU::V_PK_FMA_F32: {
+    if (!isGFX940Plus)
+      return false;
+    Sequence.push_back(AMDGPU::V_FMA_F32_e64);
+    Sequence.push_back(AMDGPU::V_FMA_F32_e64);
+    return true;
+  }
+  default:
+    return false;
+  }
+  llvm_unreachable("Fully covered switch");
+}
+
 bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
   bool IsGFX950Only = ST.hasGFX950Insts();
   bool IsGFX940Only = ST.hasGFX940Insts();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index df27ec1f8de8c..e51f3b996e250 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1237,6 +1237,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   bool isNeverCoissue(MachineInstr &MI) const;
 
+  bool getDowncastSequence(const MachineInstr &MI,
+                           SmallVectorImpl<unsigned> &Sequence,
+                           const GCNSubtarget &ST) const;
+
   /// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
   bool isLegalAV64PseudoImm(uint64_t Imm) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 7431e111ec862..b06c3f0a89399 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -51,7 +51,8 @@ class SIPreEmitPeephole {
   // for unpacking.
   void collectUnpackingCandidates(MachineInstr &BeginMI,
                                   SetVector<MachineInstr *> &InstrsToUnpack,
-                                  uint16_t NumMFMACycles);
+                                  uint16_t NumMFMACycles,
+                                  const GCNSubtarget &ST);
   // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
   // op_sel_hi:[0,0,0]
   // ==>
@@ -63,7 +64,7 @@ class SIPreEmitPeephole {
   // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
   // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
   // this transformation.
-  void performF32Unpacking(MachineInstr &I);
+  void performF32Unpacking(MachineInstr &I, const GCNSubtarget &ST);
   // Select corresponding unpacked instruction
   uint16_t mapToUnpackedOpcode(MachineInstr &I);
   // Creates the unpacked instruction to be inserted. Adds source modifiers to
@@ -583,20 +584,33 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
 
 void SIPreEmitPeephole::collectUnpackingCandidates(
     MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
-    uint16_t NumMFMACycles) {
+    uint16_t NumMFMACycles, const GCNSubtarget &ST) {
   auto *BB = BeginMI.getParent();
   auto E = BB->end();
   int TotalCyclesBetweenCandidates = 0;
   auto SchedModel = TII->getSchedModel();
+  const MCSchedModel *MCSchedMod = SchedModel.getMCSchedModel();
   Register MFMADef = BeginMI.getOperand(0).getReg();
 
   for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
     MachineInstr &Instr = *I;
-    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
-    bool IsUnpackable =
-        !(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
     if (Instr.isMetaInstruction())
       continue;
+
+    SmallVector<unsigned, 4> UnpackSequence;
+    bool IsUnpackable = TII->getDowncastSequence(Instr, UnpackSequence, ST);
+
+    // We only support unpacking where the unpack sequence is all the same
+    // opcode. To support more complex sequences we must teach
+    // performF32Unpacking how to handle them. The unpack sequence used in
+    // performF32Unpacking must agree with TII->getDowncastSequence, as
+    // this method is used for some scheduling decisions, under the assumption
+    // that this will be the sequence used for unpacking.
+    IsUnpackable &=
+        all_of(UnpackSequence, [&UnpackSequence](unsigned CurrentOpcode) {
+          return CurrentOpcode == UnpackSequence[0];
+        });
+
     if ((Instr.isTerminator()) ||
         (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
         (SIInstrInfo::modifiesModeRegister(Instr) &&
@@ -631,18 +645,33 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
     // latency, add latency of two unpacked instructions (currently estimated
     // as 2 cycles).
     TotalCyclesBetweenCandidates -= Latency;
-    // TODO: improve latency handling based on instruction modeling.
-    TotalCyclesBetweenCandidates += 2;
+
+    for (unsigned Opcode : UnpackSequence) {
+      unsigned SchedClass = TII->get(Opcode).getSchedClass();
+      const MCSchedClassDesc *SCDesc =
+          MCSchedMod->getSchedClassDesc(SchedClass);
+
+      // FIXME: We don't have an opcode based SchedClass resolution for variant
+      // SchedClass. This is a non-issue currently as none of the unpack
+      // instructions have variant SchedClasses.
+      assert(!SCDesc->isVariant());
+      uint16_t Latency =
+          SchedModel.getWriteProcResBegin(SCDesc)->ReleaseAtCycle;
+      TotalCyclesBetweenCandidates += Latency;
+    }
     // Subtract 1 to account for MFMA issue latency.
     if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
       InstrsToUnpack.insert(&Instr);
   }
 }
 
-void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
+void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I,
+                                            const GCNSubtarget &ST) {
   MachineOperand DstOp = I.getOperand(0);
 
-  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+  SmallVector<unsigned, 4> UnpackSequence;
+  TII->getDowncastSequence(I, UnpackSequence, ST);
+  uint16_t UnpackedOpcode = UnpackSequence[0];
   assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
          "Unsupported Opcode");
 
@@ -786,10 +815,10 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
           SchedModel.resolveSchedClass(&MI);
       uint16_t NumMFMACycles =
           SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
-      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
+      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles, ST);
     }
     for (MachineInstr *MI : InstrsToUnpack) {
-      performF32Unpacking(*MI);
+      performF32Unpacking(*MI, ST);
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir b/llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir
new file mode 100644
index 0000000000000..5f16e7ddfd090
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir
@@ -0,0 +1,244 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s -check-prefixes=DEFAULT,GCN
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-igrouplp-use-downcast-ops=1 -run-pass=machine-scheduler -o - %s | FileCheck %s -check-prefixes=DOWNCAST,GCN
+
+
+# The default run produces the prescribed pipeline; with amdgpu-igrouplp-use-downcast-ops, each V_PK_ADD_F32 counts as two VALU ops (8 VALU total), so the groups fill differently.
+
+---
+name: 2xVALU_1xSALU_2xVALU_1xSALU
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; DEFAULT-LABEL: name: 2xVALU_1xSALU_2xVALU_1xSALU
+    ; DEFAULT: liveins: $vgpr0_vgpr1
+    ; DEFAULT-NEXT: {{  $}}
+    ; DEFAULT-NEXT: $exec = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF2]], 8, [[DEF3]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF3]], 8, [[V_PK_ADD_F32_]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc
+    ; DEFAULT-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_]], 8, [[V_PK_ADD_F32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: dead [[V_PK_ADD_F32_3:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_1]], 8, [[V_PK_ADD_F32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc
+    ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+    ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+    ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+    ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+    ; DEFAULT-NEXT: S_ENDPGM 0
+    ;
+    ; DOWNCAST-LABEL: name: 2xVALU_1xSALU_2xVALU_1xSALU
+    ; DOWNCAST: liveins: $vgpr0_vgpr1
+    ; DOWNCAST-NEXT: {{  $}}
+    ; DOWNCAST-NEXT: $exec = IMPLICIT_DEF
+    ; DOWNCAST-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; DOWNCAST-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; DOWNCAST-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; DOWNCAST-NEXT: [[DEF3:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; DOWNCAST-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF2]], 8, [[DEF3]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; DOWNCAST-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; DOWNCAST-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; DOWNCAST-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc
+    ; DOWNCAST-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF3]], 8, [[V_PK_ADD_F32_]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; DOWNCAST-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_]], 8, [[V_PK_ADD_F32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; DOWNCAST-NEXT: dead [[V_PK_ADD_F32_3:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_1]], 8, [[V_PK_ADD_F32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; DOW...
[truncated]
