[AMDGPU] Refactor out common exec mask opcode patterns (NFCI) #154718

perlfu · 2025-08-21T10:07:34Z

Create utility mechanism for finding wave size dependent opcodes used to manipulate exec/lane masks.

llvmbot · 2025-08-21T10:08:05Z

@llvm/pr-subscribers-backend-amdgpu

Author: Carl Ritson (perlfu)

Changes

Create utility mechanism for finding wave size dependent opcodes used to manipulate exec/lane masks.

Patch is 70.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154718.diff

13 Files Affected:

(added) llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.cpp (+75)
(added) llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h (+52)
(modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+13-23)
(modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1)
(modified) llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp (+6-9)
(modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+9-12)
(modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+9-15)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+38-68)
(modified) llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp (+10-13)
(modified) llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp (+41-71)
(modified) llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp (+42-52)
(modified) llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp (+11-18)
(modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (+55-87)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.cpp
new file mode 100644
index 0000000000000..8690afcdeef6d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPULaneMaskUtils.cpp - -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULaneMaskUtils.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+
+#define DEBUG_TYPE "amdgpu-lane-mask-utils"
+
+using namespace llvm;
+
+namespace llvm::AMDGPU {
+
+LaneMaskConstants::LaneMaskConstants(unsigned WavefrontSize) {
+  if (WavefrontSize == 32) {
+    ExecReg = AMDGPU::EXEC_LO;
+    VccReg = AMDGPU::VCC_LO;
+    AndOpc = AMDGPU::S_AND_B32;
+    AndTermOpc = AMDGPU::S_AND_B32_term;
+    AndN2Opc = AMDGPU::S_ANDN2_B32;
+    AndN2SaveExecOpc = AMDGPU::S_ANDN2_SAVEEXEC_B32;
+    AndN2TermOpc = AMDGPU::S_ANDN2_B32_term;
+    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
+    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
+    BfmOpc = AMDGPU::S_BFM_B32;
+    CMovOpc = AMDGPU::S_CMOV_B32;
+    CSelectOpc = AMDGPU::S_CSELECT_B32;
+    MovOpc = AMDGPU::S_MOV_B32;
+    MovTermOpc = AMDGPU::S_MOV_B32_term;
+    OrOpc = AMDGPU::S_OR_B32;
+    OrTermOpc = AMDGPU::S_OR_B32_term;
+    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+    XorOpc = AMDGPU::S_XOR_B32;
+    XorTermOpc = AMDGPU::S_XOR_B32_term;
+    WQMOpc = AMDGPU::S_WQM_B32;
+  } else {
+    ExecReg = AMDGPU::EXEC;
+    VccReg = AMDGPU::VCC;
+    AndOpc = AMDGPU::S_AND_B64;
+    AndTermOpc = AMDGPU::S_AND_B64_term;
+    AndN2Opc = AMDGPU::S_ANDN2_B64;
+    AndN2SaveExecOpc = AMDGPU::S_ANDN2_SAVEEXEC_B64;
+    AndN2TermOpc = AMDGPU::S_ANDN2_B64_term;
+    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
+    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
+    BfmOpc = AMDGPU::S_BFM_B64;
+    CMovOpc = AMDGPU::S_CMOV_B64;
+    CSelectOpc = AMDGPU::S_CSELECT_B64;
+    MovOpc = AMDGPU::S_MOV_B64;
+    MovTermOpc = AMDGPU::S_MOV_B64_term;
+    OrOpc = AMDGPU::S_OR_B64;
+    OrTermOpc = AMDGPU::S_OR_B64_term;
+    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+    XorOpc = AMDGPU::S_XOR_B64;
+    XorTermOpc = AMDGPU::S_XOR_B64_term;
+    WQMOpc = AMDGPU::S_WQM_B64;
+  }
+}
+
+static const LaneMaskConstants Wave32LaneMaskConstants(32);
+static const LaneMaskConstants Wave64LaneMaskConstants(64);
+
+const LaneMaskConstants &getLaneMaskConstants(const GCNSubtarget *ST) {
+  unsigned WavefrontSize = ST->getWavefrontSize();
+  assert(WavefrontSize == 32 || WavefrontSize == 64);
+  return WavefrontSize == 32 ? Wave32LaneMaskConstants
+                             : Wave64LaneMaskConstants;
+}
+
+} // end namespace llvm::AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h
new file mode 100644
index 0000000000000..6c11dbd73ef3b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h
@@ -0,0 +1,52 @@
+//===- AMDGPULaneMaskUtils.h - Exec/lane mask helper functions -*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H
+
+#include "llvm/CodeGen/Register.h"
+
+namespace llvm {
+
+class GCNSubtarget;
+
+namespace AMDGPU {
+
+class LaneMaskConstants {
+public:
+  Register ExecReg;
+  Register VccReg;
+  unsigned AndOpc;
+  unsigned AndTermOpc;
+  unsigned AndN2Opc;
+  unsigned AndN2SaveExecOpc;
+  unsigned AndN2TermOpc;
+  unsigned AndSaveExecOpc;
+  unsigned AndSaveExecTermOpc;
+  unsigned BfmOpc;
+  unsigned CMovOpc;
+  unsigned CSelectOpc;
+  unsigned MovOpc;
+  unsigned MovTermOpc;
+  unsigned OrOpc;
+  unsigned OrTermOpc;
+  unsigned OrSaveExecOpc;
+  unsigned XorOpc;
+  unsigned XorTermOpc;
+  unsigned WQMOpc;
+
+  LaneMaskConstants(unsigned WavefrontSize);
+};
+
+const LaneMaskConstants &getLaneMaskConstants(const GCNSubtarget *ST);
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 237929699dd9d..7c37347139607 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -73,6 +73,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUGlobalISelUtils.h"
 #include "AMDGPUInstrInfo.h"
+#include "AMDGPULaneMaskUtils.h"
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
@@ -783,17 +784,8 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
   MachineFunction *MF = &B.getMF();
 
   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
-  const unsigned MovExecOpc =
-      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-  const unsigned MovExecTermOpc =
-      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
-
-  const unsigned XorTermOpc = Subtarget.isWave32() ?
-    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
-  const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
-    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
-  const unsigned ExecReg =  Subtarget.isWave32() ?
-    AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  const AMDGPU::LaneMaskConstants &LMC =
+      AMDGPU::getLaneMaskConstants(&Subtarget);
 
 #ifndef NDEBUG
   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
@@ -941,19 +933,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
   MRI.setRegClass(CondReg, WaveRC);
 
   // Update EXEC, save the original EXEC value to VCC.
-  B.buildInstr(AndSaveExecOpc)
-    .addDef(NewExec)
-    .addReg(CondReg, RegState::Kill);
+  B.buildInstr(LMC.AndSaveExecOpc)
+      .addDef(NewExec)
+      .addReg(CondReg, RegState::Kill);
 
   MRI.setSimpleHint(NewExec, CondReg);
 
   B.setInsertPt(*BodyBB, BodyBB->end());
 
   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
-  B.buildInstr(XorTermOpc)
-    .addDef(ExecReg)
-    .addReg(ExecReg)
-    .addReg(NewExec);
+  B.buildInstr(LMC.XorTermOpc)
+      .addDef(LMC.ExecReg)
+      .addReg(LMC.ExecReg)
+      .addReg(NewExec);
 
   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
   // s_cbranch_scc0?
@@ -962,14 +954,12 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
   B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
 
   // Save the EXEC mask before the loop.
-  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
-    .addReg(ExecReg);
+  BuildMI(MBB, MBB.end(), DL, TII->get(LMC.MovOpc), SaveExecReg)
+      .addReg(LMC.ExecReg);
 
   // Restore the EXEC mask after the loop.
   B.setMBB(*RestoreExecBB);
-  B.buildInstr(MovExecTermOpc)
-    .addDef(ExecReg)
-    .addReg(SaveExecReg);
+  B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
 
   // Set the insert point after the original instruction, so any new
   // instructions will be in the remainder.
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dc9dd220130ea..f2baf0787bcf4 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -69,6 +69,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULegalizerInfo.cpp
   AMDGPULibCalls.cpp
   AMDGPUImageIntrinsicOptimizer.cpp
+  AMDGPULaneMaskUtils.cpp
   AMDGPULibFunc.cpp
   AMDGPULowerBufferFatPointers.cpp
   AMDGPULowerKernelArguments.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index dce4e6f993005..64e8e5e4cc00a 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -66,6 +66,7 @@
 
 #include "SIFixSGPRCopies.h"
 #include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -1134,7 +1135,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
 }
 
 void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
-  bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32();
+  const AMDGPU::LaneMaskConstants &LMC =
+      AMDGPU::getLaneMaskConstants(&MF.getSubtarget<GCNSubtarget>());
   for (MachineBasicBlock &MBB : MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
          ++I) {
@@ -1148,10 +1150,7 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
         Register SCCCopy =
             MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
         I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
-                    MI.getDebugLoc(),
-                    TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32
-                                      : AMDGPU::S_CSELECT_B64),
-                    SCCCopy)
+                    MI.getDebugLoc(), TII->get(LMC.CSelectOpc), SCCCopy)
                 .addImm(-1)
                 .addImm(0);
         I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
@@ -1161,14 +1160,12 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
         continue;
       }
       if (DstReg == AMDGPU::SCC) {
-        unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
-        Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
         Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
         I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
-                    MI.getDebugLoc(), TII->get(Opcode))
+                    MI.getDebugLoc(), TII->get(LMC.AndOpc))
                 .addReg(Tmp, getDefRegState(true))
                 .addReg(SrcReg)
-                .addReg(Exec);
+                .addReg(LMC.ExecReg);
         MI.eraseFromParent();
       }
     }
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 9b348d46fec4f..7895c518103d5 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -8,6 +8,7 @@
 
 #include "SIFrameLowering.h"
 #include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
@@ -984,6 +985,7 @@ void SIFrameLowering::emitCSRSpillStores(
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
 
   // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
   // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
@@ -1015,8 +1017,7 @@ void SIFrameLowering::emitCSRSpillStores(
   StoreWWMRegisters(WWMScratchRegs);
 
   auto EnableAllLanes = [&]() {
-    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+    BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
   };
 
   if (!WWMCalleeSavedRegs.empty()) {
@@ -1043,8 +1044,7 @@ void SIFrameLowering::emitCSRSpillStores(
     TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
   } else if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
-    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
+    BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
         .addReg(ScratchExecCopy, RegState::Kill);
     LiveUnits.addReg(ScratchExecCopy);
   }
@@ -1092,6 +1092,7 @@ void SIFrameLowering::emitCSRSpillRestores(
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
 
   for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
@@ -1138,16 +1139,14 @@ void SIFrameLowering::emitCSRSpillRestores(
     Register OrigExec = Return.getOperand(0).getReg();
 
     if (!WWMScratchRegs.empty()) {
-      unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
-      BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec())
+      BuildMI(MBB, MBBI, DL, TII->get(LMC.XorOpc), LMC.ExecReg)
           .addReg(OrigExec)
           .addImm(-1);
       RestoreWWMRegisters(WWMScratchRegs);
     }
 
     // Restore original EXEC.
-    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
+    BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addReg(OrigExec);
     return;
   }
 
@@ -1159,8 +1158,7 @@ void SIFrameLowering::emitCSRSpillRestores(
   RestoreWWMRegisters(WWMScratchRegs);
   if (!WWMCalleeSavedRegs.empty()) {
     if (ScratchExecCopy) {
-      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+      BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
     } else {
       ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                              /*IsProlog*/ false,
@@ -1171,8 +1169,7 @@ void SIFrameLowering::emitCSRSpillRestores(
   RestoreWWMRegisters(WWMCalleeSavedRegs);
   if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
-    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
+    BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
         .addReg(ScratchExecCopy, RegState::Kill);
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..74f348b6771cd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14,6 +14,7 @@
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
+#include "AMDGPULaneMaskUtils.h"
 #include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -4809,6 +4810,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
   MachineFunction *MF = OrigBB.getParent();
   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
   MachineBasicBlock::iterator I = LoopBB.begin();
 
   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
@@ -4840,10 +4842,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
       .addReg(Idx.getReg(), 0, Idx.getSubReg());
 
   // Update EXEC, save the original EXEC value to VCC.
-  BuildMI(LoopBB, I, DL,
-          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
-                                 : AMDGPU::S_AND_SAVEEXEC_B64),
-          NewExec)
+  BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
       .addReg(CondReg, RegState::Kill);
 
   MRI.setSimpleHint(NewExec, CondReg);
@@ -4870,13 +4869,9 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
   }
 
   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
-  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   MachineInstr *InsertPt =
-      BuildMI(LoopBB, I, DL,
-              TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
-                                     : AMDGPU::S_XOR_B64_term),
-              Exec)
-          .addReg(Exec)
+      BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
+          .addReg(LMC.ExecReg)
           .addReg(NewExec);
 
   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
@@ -4911,15 +4906,14 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
   Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
-  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
 
   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
 
   // Save the EXEC mask
   // clang-format off
-  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
-      .addReg(Exec);
+  BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
+      .addReg(LMC.ExecReg);
   // clang-format on
 
   auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
@@ -4939,7 +4933,7 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
   LoopBB->addSuccessor(LandingPad);
   MachineBasicBlock::iterator First = LandingPad->begin();
   // clang-format off
-  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
+  BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
       .addReg(SaveExec);
   // clang-format on
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index df638bd65bdaa..7800897322d24 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -14,6 +14,7 @@
 #include "SIInstrInfo.h"
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
+#include "AMDGPULaneMaskUtils.h"
 #include "GCNHazardRecognizer.h"
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
@@ -1195,6 +1196,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                      Register FalseReg) const {
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
+  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
          "Not a VGPR32 reg");
 
@@ -1213,10 +1215,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
     switch (Cond[0].getImm()) {
     case SIInstrInfo::SCC_TRUE: {
       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
-      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
-                                            : AMDGPU::S_CSELECT_B64), SReg)
-        .addImm(1)
-        .addImm(0);
+      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
         .addImm(0)
         .addReg(FalseReg)
@@ -1227,10 +1226,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
     }
     case SIInstrInfo::SCC_FALSE: {
       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
-      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
-                                            : AMDGPU::S_CSELECT_B64), SReg)
-        .addImm(0)
-        .addImm(1);
+      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
         .addImm(0)
         .addReg(FalseReg)
@@ -1270,13 +1266,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
     case SIInstrInfo::EXECNZ: {
       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
       Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
-      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
-                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
-        .addImm(0);
-      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
-                                            : AMDGPU::S_CSELECT_B64), SReg)
-        .addImm(1)
-        .addImm(0);
+      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
...
[truncated]

arsenm · 2025-08-21T11:53:09Z

llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h

how about static constexpr getWave32/getWave64 with member initializer lists?

I second that.

I've changed the implementation to use static constexpr and initializer lists.
You'll have to let me know if it fit with what you were imagining?

I removed the static objects entirely as well, which might not have been the intend of your comment.
This perhaps creates a small performance concern if the compiler actually initializes an entire instance of the constants for some of the very localized uses, e.g. in SIInstrInfo.cpp.

cdevadas

Thanks for working on it.

cdevadas · 2025-08-21T16:15:31Z

llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h

I second that.

arsenm · 2025-08-22T03:42:19Z

llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h

I meant something more like:

static constexpr LaneMaskConstants Wave32 = {} static constexpr LaneMaskConstants Wave64 = {}; static const LaneMaskConstants &get(const GCNSubtarget &ST) { return isWave32() ? Wave32 : Wave64; }

You could also push this the setting into a proper constructor with a flag so you have the name op the opcode with the value

or

template <bool IsWave32> LaneMaskConstants:() : XorOpc(IsWave32 ? ...), ...

Thanks, that makes sense!
I've made changes based no your suggests.

perlfu · 2025-09-01T05:18:06Z

Ping

perlfu · 2025-09-12T11:47:19Z

Ping

llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h

cdevadas

LGTM.

Create utility mechanism for finding wave size dependent opcodes used to manipulate exec/lane masks.

perlfu · 2025-09-16T02:53:06Z

Rebased and apply suggestions.
Will merge shortly following completion of local testing.

perlfu requested review from arsenm, cdevadas, jayfoad, nhaehnle and rovka August 21, 2025 10:07

llvmbot added the backend:AMDGPU label Aug 21, 2025

arsenm reviewed Aug 21, 2025

View reviewed changes

cdevadas reviewed Aug 21, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h Outdated

Copy link

Collaborator

cdevadas Aug 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I second that.

arsenm reviewed Aug 22, 2025

View reviewed changes

perlfu force-pushed the amdgpu-lane-mask-utils branch from f39f7c6 to 4867f4d Compare September 1, 2025 05:17

arsenm approved these changes Sep 13, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h Outdated Show resolved Hide resolved

llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h Outdated Show resolved Hide resolved

llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h Outdated Show resolved Hide resolved

cdevadas approved these changes Sep 15, 2025

View reviewed changes

[AMDGPU] Refactor out common exec mask opcode patterns (NFCI)

414af5a

Create utility mechanism for finding wave size dependent opcodes used to manipulate exec/lane masks.

perlfu force-pushed the amdgpu-lane-mask-utils branch from 4867f4d to 414af5a Compare September 16, 2025 02:51

perlfu enabled auto-merge (squash) September 16, 2025 03:20

perlfu merged commit fdb06d9 into llvm:main Sep 16, 2025
9 checks passed

[AMDGPU] Refactor out common exec mask opcode patterns (NFCI) #154718

[AMDGPU] Refactor out common exec mask opcode patterns (NFCI) #154718

Uh oh!

Conversation

perlfu commented Aug 21, 2025

Uh oh!

llvmbot commented Aug 21, 2025

Uh oh!

arsenm Aug 21, 2025

Choose a reason for hiding this comment

Uh oh!

cdevadas Aug 21, 2025

Choose a reason for hiding this comment

Uh oh!

perlfu Aug 22, 2025

Choose a reason for hiding this comment

Uh oh!

cdevadas left a comment

Choose a reason for hiding this comment

Uh oh!

cdevadas Aug 21, 2025

Choose a reason for hiding this comment

Uh oh!

arsenm Aug 22, 2025

Choose a reason for hiding this comment

Uh oh!

arsenm Aug 22, 2025

Choose a reason for hiding this comment

Uh oh!

perlfu Aug 22, 2025

Choose a reason for hiding this comment

Uh oh!

perlfu commented Sep 1, 2025

Uh oh!

perlfu commented Sep 12, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

cdevadas left a comment

Choose a reason for hiding this comment

Uh oh!

perlfu commented Sep 16, 2025

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants