Conversation

akadutta (Contributor)

Optimize condition checks and remove compilation overhead for unsupported architectures.

llvmbot (Member) commented Oct 17, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Akash Dutta (akadutta)

Changes

Optimize condition checks and remove compilation overhead for unsupported architectures: reuse the opcode returned by mapToUnpackedOpcode (its uint16_t max sentinel already identifies unsupported instructions) instead of the separate isUnpackingSupportedInstr helper, rename the source-operand variables to line up with src0/src1/src2, and run the unpacking scan only on subtargets that support it.


Full diff: https://github.com/llvm/llvm-project/pull/163992.diff

1 file affected:

  • (modified) llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (+30-44)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 01a40c1e38817..d9c3d6b399225 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -47,9 +47,6 @@ class SIPreEmitPeephole {
                              const MachineBasicBlock &From,
                              const MachineBasicBlock &To) const;
   bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
-  // Check if the machine instruction being processed is a supported packed
-  // instruction.
-  bool isUnpackingSupportedInstr(MachineInstr &MI) const;
   // Creates a list of packed instructions following an MFMA that are suitable
   // for unpacking.
   void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
   return true;
 }
 
-// If support is extended to new operations, add tests in
-// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
-  if (!TII->isNeverCoissue(MI))
-    return false;
-  unsigned Opcode = MI.getOpcode();
-  switch (Opcode) {
-  case AMDGPU::V_PK_ADD_F32:
-  case AMDGPU::V_PK_MUL_F32:
-  case AMDGPU::V_PK_FMA_F32:
-    return true;
-  default:
-    return false;
-  }
-  llvm_unreachable("Fully covered switch");
-}
-
 bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
   unsigned OpCode = MI.getOpcode();
   Register DstReg = MI.getOperand(0).getReg();
@@ -612,10 +592,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
 
   for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
     MachineInstr &Instr = *I;
+    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
     if (Instr.isMetaInstruction())
       continue;
     if ((Instr.isTerminator()) ||
-        (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+        (TII->isNeverCoissue(Instr) &&
+         (UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||
         (SIInstrInfo::modifiesModeRegister(Instr) &&
          Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
       return;
@@ -639,7 +621,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
       if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
         return;
     }
-    if (!isUnpackingSupportedInstr(Instr))
+    if (UnpackedOpCode == std::numeric_limits<uint16_t>::max())
       continue;
 
     if (canUnpackingClobberRegister(Instr))
@@ -687,8 +669,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                         bool IsHiBits) {
   MachineBasicBlock &MBB = *I.getParent();
   const DebugLoc &DL = I.getDebugLoc();
-  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
-  const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+  const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
   Register DstReg = I.getOperand(0).getReg();
   unsigned OpCode = I.getOpcode();
   Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -702,15 +684,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
 
   MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
   NewMI.addDef(UnpackedDstReg); // vdst
-  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
-  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
+  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
+  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
 
   if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
-    const MachineOperand *SrcMO3 =
+    const MachineOperand *SrcMO2 =
         TII->getNamedOperand(I, AMDGPU::OpName::src2);
     unsigned Src2Mods =
         TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
-    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
+    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
   }
   NewMI.addImm(ClampVal); // clamp
   // Packed instructions do not support output modifiers. Safe to assign them 0
@@ -787,22 +769,26 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
 
   // TODO: Fold this into previous block, if possible. Evaluate and handle any
   // side effects.
-  for (MachineBasicBlock &MBB : MF) {
-    // Unpack packed instructions overlapped by MFMAs. This allows the compiler
-    // to co-issue unpacked instructions with MFMA
-    auto SchedModel = TII->getSchedModel();
-    SetVector<MachineInstr *> InstrsToUnpack;
-    for (auto &MI : make_early_inc_range(MBB.instrs())) {
-      if (!SIInstrInfo::isMFMA(MI))
-        continue;
-      const MCSchedClassDesc *SchedClassDesc =
-          SchedModel.resolveSchedClass(&MI);
-      uint16_t NumMFMACycles =
-          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
-      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
-    }
-    for (MachineInstr *MI : InstrsToUnpack) {
-      performF32Unpacking(*MI);
+
+  // Perform the extra MF scans only for supported archs
+  if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
+    for (MachineBasicBlock &MBB : MF) {
+      // Unpack packed instructions overlapped by MFMAs. This allows the compiler
+      // to co-issue unpacked instructions with MFMA
+      auto SchedModel = TII->getSchedModel();
+      SetVector<MachineInstr *> InstrsToUnpack;
+      for (auto &MI : make_early_inc_range(MBB.instrs())) {
+        if (!SIInstrInfo::isMFMA(MI))
+          continue;
+        const MCSchedClassDesc *SchedClassDesc =
+            SchedModel.resolveSchedClass(&MI);
+        uint16_t NumMFMACycles =
+            SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+        collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
+      }
+      for (MachineInstr *MI : InstrsToUnpack) {
+        performF32Unpacking(*MI);
+      }
     }
   }
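
Net effect of the first hunk: mapToUnpackedOpcode already distinguishes supported from unsupported instructions by returning std::numeric_limits<uint16_t>::max() as an "unsupported" sentinel, so the separate isUnpackingSupportedInstr switch was redundant. A minimal sketch of that shape (the scalar opcodes below are assumptions inferred from the packed opcodes named in the deleted switch, not copied from the file):

  // Returns the unpacked (VOP3) opcode for a supported packed instruction,
  // or uint16_t max as the "no unpacked form" sentinel.
  uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::V_PK_ADD_F32:
      return AMDGPU::V_ADD_F32_e64;
    case AMDGPU::V_PK_MUL_F32:
      return AMDGPU::V_MUL_F32_e64;
    case AMDGPU::V_PK_FMA_F32:
      return AMDGPU::V_FMA_F32_e64;
    default:
      return std::numeric_limits<uint16_t>::max();
    }
  }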
 

jrbyrnes (Contributor) left a comment

Add NFC to commit title.

performF32Unpacking(*MI);

// Perform the extra MF scans only for supported archs
if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
jrbyrnes (Contributor) commented Oct 17, 2025

if (!(ST.hasGFX950Insts() || ST.hasGFX940Insts()))
	return;


Also, you can just use hasGFX940Insts, as gfx950 implies it.

     if ((Instr.isTerminator()) ||
-        (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+        (TII->isNeverCoissue(Instr) &&
+         (UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||

Can you use a local variable -- IsUnpackable = UnpackedOpCode == std::numeric_limits<uint16_t>::max()
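
A sketch of that suggestion applied to collectUnpackingCandidates (note: the comparison is flipped relative to the comment so the flag is true when an unpacked form exists, which matches the name; this is an editor's reading of the suggestion, not the merged code):

    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
    // mapToUnpackedOpcode returns uint16_t max when there is no unpacked form.
    bool IsUnpackable =
        UnpackedOpCode != std::numeric_limits<uint16_t>::max();
    if (Instr.isTerminator() ||
        (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
        (SIInstrInfo::modifiesModeRegister(Instr) &&
         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
      return;
    ...
    if (!IsUnpackable)
      continue;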

akadutta changed the title from "[AMDGPU]:: Minor Unpacking Fixes." to "[AMDGPU][NFC]:: Minor Unpacking Fixes." on Oct 17, 2025

github-actions bot commented Oct 17, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

performF32Unpacking(*MI);

// Perform the extra MF scans only for supported archs
if (ST.hasGFX940Insts()) {

Use early exit to reduce nesting https://llvm.org/docs/CodingStandards.html#use-early-exits-and-continue-to-simplify-code

if (!ST.hasGFX940Insts())
    return Changed;

for (MachineBasicBlock &MBB : MF) {
...

jrbyrnes (Contributor) left a comment

LGTM -- thanks

jrbyrnes merged commit 8b2fc00 into llvm:main on Oct 17, 2025. 10 checks passed.