[AMDGPU] Add SchedGroupMask::PACK

jrbyrnes · jrbyrnes · commit 0993c38a2dcf · 2025-04-28T20:32:24.000-07:00
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -76,8 +76,9 @@ enum class SchedGroupMask {
   DS_READ = 1u << 8,
   DS_WRITE = 1u << 9,
   TRANS = 1u << 10,
+  PACK = 1u << 11,
   ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
-        DS_READ | DS_WRITE | TRANS,
+        DS_READ | DS_WRITE | TRANS | PACK,
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
 };
 
@@ -112,7 +113,7 @@ class InstructionRule {
   virtual ~InstructionRule() = default;
 };
 
-using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>;
+typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
 
 // Classify instructions into groups to enable fine tuned control over the
 // scheduler. These groups may be more specific than current SchedModel
@@ -190,9 +191,13 @@ class SchedGroup {
   // Returns true if the SU matches all rules
   bool allowedByRules(const SUnit *SU,
                       SmallVectorImpl<SchedGroup> &SyncPipe) const {
-    for (auto &Rule : Rules) {
-      if (!Rule.get()->apply(SU, Collection, SyncPipe))
+    if (Rules.empty())
+      return true;
+    for (size_t I = 0; I < Rules.size(); I++) {
+      auto TheRule = Rules[I].get();
+      if (!TheRule->apply(SU, Collection, SyncPipe)) {
         return false;
+      }
     }
     return true;
   }
@@ -240,8 +245,8 @@ class SchedGroup {
   }
 };
 
-using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
-using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
+typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
+typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
 
 // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
 // in non-trivial cases. For example, if the requested pipeline is
@@ -290,7 +295,7 @@ class PipelineSolver {
   uint64_t BranchesExplored = 0;
 
   // The direction in which we process the candidate SchedGroups per SU
-  bool IsBottomUp = true;
+  bool IsBottomUp = 1;
 
   // Update indices to fit next conflicting instruction
   void advancePosition();
@@ -344,7 +349,7 @@ class PipelineSolver {
 
   PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
-                 ScheduleDAGMI *DAG, bool IsBottomUp = true)
+                 ScheduleDAGMI *DAG, bool IsBottomUp = 1)
       : DAG(DAG), SyncedInstrs(SyncedInstrs),
         SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
 
@@ -836,7 +841,7 @@ class IGLPStrategy {
   virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                                    AMDGPU::SchedulingPhase Phase) = 0;
 
-  bool IsBottomUp = true;
+  bool IsBottomUp = 1;
 
   IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
       : DAG(DAG), TII(TII) {}
@@ -859,7 +864,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
 
   MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
       : IGLPStrategy(DAG, TII) {
-    IsBottomUp = true;
+    IsBottomUp = 1;
   }
 };
 
@@ -1328,7 +1333,7 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy {
 
   MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
       : IGLPStrategy(DAG, TII) {
-    IsBottomUp = false;
+    IsBottomUp = 0;
   }
 };
 
@@ -1460,7 +1465,9 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
 
   MFMAChains = 0;
   for (auto &MFMAPipeSU : MFMAPipeSUs) {
-    if (is_contained(MFMAChainSeeds, MFMAPipeSU))
+    if (MFMAChainSeeds.size() &&
+        std::find(MFMAChainSeeds.begin(), MFMAChainSeeds.end(), MFMAPipeSU) !=
+            MFMAChainSeeds.end())
       continue;
     if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
                      [&TII](SDep &Succ) {
@@ -2039,7 +2046,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
 
   MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
       : IGLPStrategy(DAG, TII) {
-    IsBottomUp = false;
+    IsBottomUp = 0;
   }
 };
 
@@ -2349,7 +2356,7 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   // created SchedGroup first, and will consider that as the ultimate
   // predecessor group when linking. TOP_DOWN instead links and processes the
   // first created SchedGroup first.
-  bool IsBottomUp = true;
+  bool IsBottomUp = 1;
 
   // The scheduling phase this application of IGLP corresponds with.
   AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
@@ -2420,6 +2427,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
            TII->isTRANS(MI))
     Result = true;
 
+  else if (((SGMask & SchedGroupMask::PACK) != SchedGroupMask::NONE) &&
+           TII->isVOP3P(MI) && !TII->isMFMAorWMMA(MI))
+    Result = true;
+
   LLVM_DEBUG(
       dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
              << (Result ? " could classify " : " unable to classify ") << MI);
@@ -2444,7 +2455,7 @@ int SchedGroup::link(SUnit &SU, bool MakePred,
     // the A->B edge impossible, otherwise it returns true;
     bool Added = tryAddEdge(A, B);
     if (Added)
-      AddedEdges.emplace_back(A, B);
+      AddedEdges.push_back(std::pair(A, B));
     else
       ++MissedEdges;
   }