Conversation

akadutta (Contributor)

Optimize condition checks and remove compilation overhead for unsupported architectures.

llvmbot (Member) commented Oct 17, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Akash Dutta (akadutta)

Changes

Optimize condition checks and remove compilation overhead for unsupported architectures: reuse the opcode returned by mapToUnpackedOpcode (its uint16_t max sentinel already identifies unsupported instructions) instead of the separate isUnpackingSupportedInstr helper, rename the source-operand variables to line up with src0/src1/src2, and run the unpacking scan only on subtargets that support it.


Full diff: https://github.com/llvm/llvm-project/pull/163992.diff

1 file affected:

  • (modified) llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (+30-44)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 01a40c1e38817..d9c3d6b399225 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -47,9 +47,6 @@ class SIPreEmitPeephole {
                              const MachineBasicBlock &From,
                              const MachineBasicBlock &To) const;
   bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
-  // Check if the machine instruction being processed is a supported packed
-  // instruction.
-  bool isUnpackingSupportedInstr(MachineInstr &MI) const;
   // Creates a list of packed instructions following an MFMA that are suitable
   // for unpacking.
   void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
   return true;
 }
 
-// If support is extended to new operations, add tests in
-// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
-  if (!TII->isNeverCoissue(MI))
-    return false;
-  unsigned Opcode = MI.getOpcode();
-  switch (Opcode) {
-  case AMDGPU::V_PK_ADD_F32:
-  case AMDGPU::V_PK_MUL_F32:
-  case AMDGPU::V_PK_FMA_F32:
-    return true;
-  default:
-    return false;
-  }
-  llvm_unreachable("Fully covered switch");
-}
-
 bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
   unsigned OpCode = MI.getOpcode();
   Register DstReg = MI.getOperand(0).getReg();
@@ -612,10 +592,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
 
   for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
     MachineInstr &Instr = *I;
+    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
     if (Instr.isMetaInstruction())
       continue;
     if ((Instr.isTerminator()) ||
-        (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+        (TII->isNeverCoissue(Instr) &&
+         (UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||
         (SIInstrInfo::modifiesModeRegister(Instr) &&
          Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
       return;
@@ -639,7 +621,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
       if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
         return;
     }
-    if (!isUnpackingSupportedInstr(Instr))
+    if (UnpackedOpCode == std::numeric_limits<uint16_t>::max())
       continue;
 
     if (canUnpackingClobberRegister(Instr))
@@ -687,8 +669,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                         bool IsHiBits) {
   MachineBasicBlock &MBB = *I.getParent();
   const DebugLoc &DL = I.getDebugLoc();
-  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
-  const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+  const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
   Register DstReg = I.getOperand(0).getReg();
   unsigned OpCode = I.getOpcode();
   Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -702,15 +684,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
 
   MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
   NewMI.addDef(UnpackedDstReg); // vdst
-  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
-  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
+  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
+  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
 
   if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
-    const MachineOperand *SrcMO3 =
+    const MachineOperand *SrcMO2 =
         TII->getNamedOperand(I, AMDGPU::OpName::src2);
     unsigned Src2Mods =
         TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
-    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
+    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
   }
   NewMI.addImm(ClampVal); // clamp
   // Packed instructions do not support output modifiers. Safe to assign them 0
@@ -787,22 +769,26 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
 
   // TODO: Fold this into previous block, if possible. Evaluate and handle any
   // side effects.
-  for (MachineBasicBlock &MBB : MF) {
-    // Unpack packed instructions overlapped by MFMAs. This allows the compiler
-    // to co-issue unpacked instructions with MFMA
-    auto SchedModel = TII->getSchedModel();
-    SetVector<MachineInstr *> InstrsToUnpack;
-    for (auto &MI : make_early_inc_range(MBB.instrs())) {
-      if (!SIInstrInfo::isMFMA(MI))
-        continue;
-      const MCSchedClassDesc *SchedClassDesc =
-          SchedModel.resolveSchedClass(&MI);
-      uint16_t NumMFMACycles =
-          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
-      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
-    }
-    for (MachineInstr *MI : InstrsToUnpack) {
-      performF32Unpacking(*MI);
+
+  // Perform the extra MF scans only for supported archs
+  if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
+    for (MachineBasicBlock &MBB : MF) {
+      // Unpack packed instructions overlapped by MFMAs. This allows the compiler
+      // to co-issue unpacked instructions with MFMA
+      auto SchedModel = TII->getSchedModel();
+      SetVector<MachineInstr *> InstrsToUnpack;
+      for (auto &MI : make_early_inc_range(MBB.instrs())) {
+        if (!SIInstrInfo::isMFMA(MI))
+          continue;
+        const MCSchedClassDesc *SchedClassDesc =
+            SchedModel.resolveSchedClass(&MI);
+        uint16_t NumMFMACycles =
+            SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+        collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
+      }
+      for (MachineInstr *MI : InstrsToUnpack) {
+        performF32Unpacking(*MI);
+      }
     }
   }
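
Net effect of the first hunk: mapToUnpackedOpcode already distinguishes supported from unsupported instructions by returning std::numeric_limits<uint16_t>::max() as an "unsupported" sentinel, so the separate isUnpackingSupportedInstr switch was redundant. A minimal sketch of that shape (the scalar opcodes below are assumptions inferred from the packed opcodes named in the deleted switch, not copied from the file):

  // Returns the unpacked (VOP3) opcode for a supported packed instruction,
  // or uint16_t max as the "no unpacked form" sentinel.
  uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::V_PK_ADD_F32:
      return AMDGPU::V_ADD_F32_e64;
    case AMDGPU::V_PK_MUL_F32:
      return AMDGPU::V_MUL_F32_e64;
    case AMDGPU::V_PK_FMA_F32:
      return AMDGPU::V_FMA_F32_e64;
    default:
      return std::numeric_limits<uint16_t>::max();
    }
  }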
 

jrbyrnes (Contributor) left a comment

Add NFC to commit title.

performF32Unpacking(*MI);

// Perform the extra MF scans only for supported archs
if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
jrbyrnes (Contributor) commented Oct 17, 2025

if (!(ST.hasGFX950Insts() || ST.hasGFX940Insts()))
	return;


Also, you can just use hasGFX940Insts, as gfx950 implies it.

     if ((Instr.isTerminator()) ||
-        (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+        (TII->isNeverCoissue(Instr) &&
+         (UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||

Can you use a local variable -- IsUnpackable = UnpackedOpCode == std::numeric_limits<uint16_t>::max()
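
A sketch of that suggestion applied to collectUnpackingCandidates (note: the comparison is flipped relative to the comment so the flag is true when an unpacked form exists, which matches the name; this is an editor's reading of the suggestion, not the merged code):

    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
    // mapToUnpackedOpcode returns uint16_t max when there is no unpacked form.
    bool IsUnpackable =
        UnpackedOpCode != std::numeric_limits<uint16_t>::max();
    if (Instr.isTerminator() ||
        (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
        (SIInstrInfo::modifiesModeRegister(Instr) &&
         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
      return;
    ...
    if (!IsUnpackable)
      continue;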

akadutta changed the title from "[AMDGPU]:: Minor Unpacking Fixes." to "[AMDGPU][NFC]:: Minor Unpacking Fixes." on Oct 17, 2025

github-actions bot commented Oct 17, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

performF32Unpacking(*MI);

// Perform the extra MF scans only for supported archs
if (ST.hasGFX940Insts()) {

Use early exit to reduce nesting https://llvm.org/docs/CodingStandards.html#use-early-exits-and-continue-to-simplify-code

if (!ST.hasGFX940Insts())
    return Changed;

for (MachineBasicBlock &MBB : MF) {
...

jrbyrnes (Contributor) left a comment

LGTM -- thanks

jrbyrnes merged commit 8b2fc00 into llvm:main on Oct 17, 2025. 10 checks passed.